diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000000..660c62884be --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,18 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md new file mode 100644 index 00000000000..61e797b9ca1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-proposal-discussion.md @@ -0,0 +1,18 @@ +--- +name: Feature proposal or discussion +about: Suggest an idea for Kaldi +title: '' +labels: discussion +assignees: '' + +--- + + diff --git a/.gitignore b/.gitignore index df7cb26de9f..ed66c79d1c8 100644 --- a/.gitignore +++ b/.gitignore @@ -73,15 +73,19 @@ GSYMS /src/kaldi.mk.bak # /egs/ -/egs/*/s*/mfcc -/egs/*/s*/plp -/egs/*/s*/exp -/egs/*/s*/data +/egs/*/*/mfcc +/egs/*/*/plp +/egs/*/*/exp +/egs/*/*/data +/egs/*/*/wav +/egs/*/*/enhan # /tools/ +/tools/pocolm/ /tools/ATLAS/ /tools/atlas3.8.3.tar.gz /tools/irstlm/ +/tools/mitlm/ /tools/openfst /tools/openfst-1.3.2.tar.gz /tools/openfst-1.3.2/ @@ -143,3 +147,12 @@ GSYMS /tools/mmseg-1.3.0.tar.gz /tools/mmseg-1.3.0/ /kaldiwin_vs* +/tools/cub-1.8.0.zip +/tools/cub-1.8.0/ +/tools/cub +/tools/python/ + +# These CMakeLists.txt files are all genareted on the fly at the moment. +# They are added here to avoid accidently checkin. +/src/**/CMakeLists.txt +/build* diff --git a/.travis.yml b/.travis.yml index 23507297413..51e49653efc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ script: # for the explanation why extra switches needed for clang with ccache. - CXX="ccache clang++-3.8 -Qunused-arguments -fcolor-diagnostics -Wno-tautological-compare" CFLAGS="" - LDFLAGS="-llapack" + LDFLAGS="-llapack -Wl,-fuse-ld=gold" INCDIRS="$XROOT/usr/include" LIBDIRS="$XROOT/usr/lib" tools/extras/travis_script.sh diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000000..748d88a351f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,195 @@ +cmake_minimum_required(VERSION 3.5) +project(kaldi) + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") +include(GNUInstallDirs) +include(Utils) +include(third_party/get_third_party) + +message(STATUS "Running gen_cmake_skeleton.py") +execute_process(COMMAND python + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/gen_cmake_skeleton.py" + "${CMAKE_CURRENT_SOURCE_DIR}/src" + "--quiet" +) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_INSTALL_MESSAGE LAZY) # hide "-- Up-to-date: ..." 
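+# NOTE: the per-directory src/**/CMakeLists.txt files are generated on the fly
+# by the gen_cmake_skeleton.py call above; they are git-ignored and not meant
+# to be checked in.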
+if(BUILD_SHARED_LIBS) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + if(WIN32) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + message(FATAL_ERROR "DLL is not supported currently") + elseif(APPLE) + set(CMAKE_INSTALL_RPATH "@loader_path") + else() + set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../lib") + endif() +endif() + +set(MATHLIB "OpenBLAS" CACHE STRING "OpenBLAS|MKL|Accelerate") +option(KALDI_BUILD_EXE "If disabled, will make add_kaldi_executable a no-op" ON) +option(KALDI_BUILD_TEST "If disabled, will make add_kaldi_test_executable a no-op" ON) +option(KALDI_USE_PATCH_NUMBER "Use MAJOR.MINOR.PATCH format, otherwise MAJOR.MINOR" OFF) + +link_libraries(${CMAKE_DL_LIBS}) + +find_package(Threads) +link_libraries(Threads::Threads) + +if(MATHLIB STREQUAL "OpenBLAS") + set(BLA_VENDOR "OpenBLAS") + find_package(LAPACK REQUIRED) + add_definitions(-DHAVE_CLAPACK=1) + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/tools/CLAPACK) + link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +elseif(MATHLIB STREQUAL "MKL") + set(BLA_VENDOR "Intel10_64lp") + # find_package(BLAS REQUIRED) + normalize_env_path(ENV{MKLROOT}) + find_package(LAPACK REQUIRED) + add_definitions(-DHAVE_MKL=1) + include_directories($ENV{MKLROOT}/include) # TODO: maybe not use env, idk, find_package doesnt handle includes... + link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +elseif(MATHLIB STREQUAL "Accelerate") + set(BLA_VENDOR "Apple") + find_package(BLAS REQUIRED) + find_package(LAPACK REQUIRED) + add_definitions(-DHAVE_CLAPACK=1) + link_libraries(${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) +else() + message(FATAL_ERROR "${MATHLIB} is not tested and supported, you are on your own now.") +endif() + +if(MSVC) + # Added in source, but we actually should do it in build script, whatever... + # add_definitions(-DWIN32_LEAN_AND_MEAN=1) + + add_compile_options(/permissive- /FS /wd4819 /EHsc /bigobj) + + # some warnings related with fst + add_compile_options(/wd4018 /wd4244 /wd4267 /wd4291 /wd4305) + + set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE INTERNAL "") + if(NOT DEFINED ENV{CUDAHOSTCXX}) + set(ENV{CUDAHOSTCXX} ${CMAKE_CXX_COMPILER}) + endif() + if(NOT DEFINED CUDA_HOST_COMPILER) + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) + endif() +endif() + +find_package(CUDA) +if(CUDA_FOUND) + set(CUB_ROOT_DIR "${PROJECT_SOURCE_DIR}/tools/cub") + + set(CUDA_PROPAGATE_HOST_FLAGS ON) + set(KALDI_CUDA_NVCC_FLAGS "--default-stream=per-thread;-std=c++${CMAKE_CXX_STANDARD}") + if(MSVC) + list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler /permissive-,/FS,/wd4819,/EHsc,/bigobj") + list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler /wd4018,/wd4244,/wd4267,/wd4291,/wd4305") + if(BUILD_SHARED_LIBS) + list(APPEND CUDA_NVCC_FLAGS_RELEASE -Xcompiler /MD) + list(APPEND CUDA_NVCC_FLAGS_DEBUG -Xcompiler /MDd) + endif() + else() + # list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler -std=c++${CMAKE_CXX_STANDARD}") + list(APPEND KALDI_CUDA_NVCC_FLAGS "-Xcompiler -fPIC") + endif() + set(CUDA_NVCC_FLAGS ${KALDI_CUDA_NVCC_FLAGS} ${CUDA_NVCC_FLAGS}) + + add_definitions(-DHAVE_CUDA=1) + add_definitions(-DCUDA_API_PER_THREAD_DEFAULT_STREAM=1) + include_directories(${CUDA_INCLUDE_DIRS}) + link_libraries( + ${CUDA_LIBRARIES} + ${CUDA_CUDA_LIBRARY} + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_CUFFT_LIBRARIES} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY}) + + find_package(NvToolExt REQUIRED) + include_directories(${NvToolExt_INCLUDE_DIR}) + link_libraries(${NvToolExt_LIBRARIES}) + + find_package(CUB REQUIRED) + include_directories(${CUB_INCLUDE_DIR}) +endif() + 
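+# PortAudio is not handled by this CMake setup; defining KALDI_NO_PORTAUDIO
+# below compiles out the code paths that would need it.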
+add_definitions(-DKALDI_NO_PORTAUDIO=1) + +include(VersionHelper) +get_version() # this will set KALDI_VERSION and KALDI_PATCH_NUMBER +if(${KALDI_USE_PATCH_NUMBER}) + set(KALDI_VERSION "${KALDI_VERSION}.${KALDI_PATCH_NUMBER}") +endif() + +get_third_party(openfst) +set(OPENFST_ROOT_DIR ${CMAKE_CURRENT_BINARY_DIR}/openfst) +include(third_party/openfst_lib_target) +link_libraries(fst) + +# add all native libraries +add_subdirectory(src/base) # NOTE, we need to patch the target with version from outside +set_property(TARGET kaldi-base PROPERTY COMPILE_DEFINITIONS "KALDI_VERSION=\"${KALDI_VERSION}\"") +add_subdirectory(src/matrix) +add_subdirectory(src/cudamatrix) +add_subdirectory(src/util) +add_subdirectory(src/feat) +add_subdirectory(src/tree) +add_subdirectory(src/gmm) +add_subdirectory(src/transform) +add_subdirectory(src/sgmm2) +add_subdirectory(src/fstext) +add_subdirectory(src/hmm) +add_subdirectory(src/lm) +add_subdirectory(src/decoder) +add_subdirectory(src/lat) +add_subdirectory(src/nnet) +add_subdirectory(src/nnet2) +add_subdirectory(src/nnet3) +add_subdirectory(src/rnnlm) +add_subdirectory(src/chain) +add_subdirectory(src/ivector) +add_subdirectory(src/online) +add_subdirectory(src/online2) +add_subdirectory(src/kws) + +add_subdirectory(src/itf) + +# add all cuda libraries +if(CUDA_FOUND) + add_subdirectory(src/cudafeat) + add_subdirectory(src/cudadecoder) +endif() + +# add all native executables +add_subdirectory(src/gmmbin) +add_subdirectory(src/featbin) +add_subdirectory(src/onlinebin) + +# add all cuda executables +if(CUDA_FOUND) + add_subdirectory(src/cudafeatbin) + add_subdirectory(src/cudadecoderbin) +endif() + +include(CMakePackageConfigHelpers) +# maybe we should put this into subfolder? +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/kaldi-config.cmake.in + ${CMAKE_BINARY_DIR}/cmake/kaldi-config.cmake + INSTALL_DESTINATION lib/cmake/kaldi +) +write_basic_package_version_file( + ${CMAKE_BINARY_DIR}/cmake/kaldi-config-version.cmake + VERSION ${KALDI_VERSION} + COMPATIBILITY AnyNewerVersion +) +install(FILES ${CMAKE_BINARY_DIR}/cmake/kaldi-config.cmake ${CMAKE_BINARY_DIR}/cmake/kaldi-config-version.cmake + DESTINATION lib/cmake/kaldi +) +install(EXPORT kaldi-targets DESTINATION ${CMAKE_INSTALL_PREFIX}/lib/cmake/kaldi) diff --git a/INSTALL b/INSTALL index 2dbf318118c..7beb79a7336 100644 --- a/INSTALL +++ b/INSTALL @@ -1,9 +1,16 @@ This is the official Kaldi INSTALL. Look also at INSTALL.md for the git mirror installation. -[for native Windows install, see windows/INSTALL] +[Option 1 in the following does not apply to native Windows install, see windows/INSTALL or following Option 2] -(1) -go to tools/ and follow INSTALL instructions there. +Option 1 (bash + makefile): -(2) -go to src/ and follow INSTALL instructions there. + Steps: + (1) + go to tools/ and follow INSTALL instructions there. + (2) + go to src/ and follow INSTALL instructions there. + +Option 2 (cmake): + + Go to cmake/ and follow INSTALL.md instructions there. + Note, it may not be well tested and some features are missing currently. diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake new file mode 100644 index 00000000000..67676110c6d --- /dev/null +++ b/cmake/FindBLAS.cmake @@ -0,0 +1,816 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. 
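+# (Bundled copy of the FindBLAS module distributed with CMake; see the license
+# reference above.)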
+ +#[=======================================================================[.rst: +FindBLAS +-------- + +Find Basic Linear Algebra Subprograms (BLAS) library + +This module finds an installed Fortran library that implements the +BLAS linear-algebra interface (see http://www.netlib.org/blas/). The +list of libraries searched for is taken from the ``autoconf`` macro file, +``acx_blas.m4`` (distributed at +http://ac-archive.sourceforge.net/ac-archive/acx_blas.html). + +Input Variables +^^^^^^^^^^^^^^^ + +The following variables may be set to influence this module's behavior: + +``BLA_STATIC`` + if ``ON`` use static linkage + +``BLA_VENDOR`` + If set, checks only the specified vendor, if not set checks all the + possibilities. List of vendors valid in this module: + + * Goto + * OpenBLAS + * FLAME + * ATLAS PhiPACK + * CXML + * DXML + * SunPerf + * SCSL + * SGIMATH + * IBMESSL + * Intel10_32 (intel mkl v10 32 bit) + * Intel10_64lp (intel mkl v10+ 64 bit, threaded code, lp64 model) + * Intel10_64lp_seq (intel mkl v10+ 64 bit, sequential code, lp64 model) + * Intel10_64ilp (intel mkl v10+ 64 bit, threaded code, ilp64 model) + * Intel10_64ilp_seq (intel mkl v10+ 64 bit, sequential code, ilp64 model) + * Intel (obsolete versions of mkl 32 and 64 bit) + * ACML + * ACML_MP + * ACML_GPU + * Apple + * NAS + * Generic + +``BLA_F95`` + if ``ON`` tries to find the BLAS95 interfaces + +``BLA_PREFER_PKGCONFIG`` + if set ``pkg-config`` will be used to search for a BLAS library first + and if one is found that is preferred + +Result Variables +^^^^^^^^^^^^^^^^ + +This module defines the following variables: + +``BLAS_FOUND`` + library implementing the BLAS interface is found +``BLAS_LINKER_FLAGS`` + uncached list of required linker flags (excluding ``-l`` and ``-L``). +``BLAS_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use BLAS (may be empty if compiler implicitly links BLAS) +``BLAS95_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use BLAS95 interface +``BLAS95_FOUND`` + library implementing the BLAS95 interface is found + +.. note:: + + C or CXX must be enabled to use Intel Math Kernel Library (MKL) + + For example, to use Intel MKL libraries and/or Intel compiler: + + .. code-block:: cmake + + set(BLA_VENDOR Intel10_64lp) + find_package(BLAS) + +Hints +^^^^^ + +Set ``MKLROOT`` environment variable to a directory that contains an MKL +installation. + +#]=======================================================================] + +include(CheckFunctionExists) +include(CheckFortranFunctionExists) +include(CMakePushCheckState) +include(FindPackageHandleStandardArgs) +cmake_push_check_state() +set(CMAKE_REQUIRED_QUIET ${BLAS_FIND_QUIETLY}) + +set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + +# Check the language being used +if( NOT (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED OR CMAKE_Fortran_COMPILER_LOADED) ) + if(BLAS_FIND_REQUIRED) + message(FATAL_ERROR "FindBLAS requires Fortran, C, or C++ to be enabled.") + else() + message(STATUS "Looking for BLAS... 
- NOT found (Unsupported languages)") + return() + endif() +endif() + +if(BLA_PREFER_PKGCONFIG) + find_package(PkgConfig) + pkg_check_modules(PKGC_BLAS blas) + if(PKGC_BLAS_FOUND) + set(BLAS_FOUND ${PKGC_BLAS_FOUND}) + set(BLAS_LIBRARIES "${PKGC_BLAS_LINK_LIBRARIES}") + return() + endif() +endif() + +macro(Check_Fortran_Libraries LIBRARIES _prefix _name _flags _list _thread) + # This macro checks for the existence of the combination of fortran libraries + # given by _list. If the combination is found, this macro checks (using the + # Check_Fortran_Function_Exists macro) whether can link against that library + # combination using the name of a routine given by _name using the linker + # flags given by _flags. If the combination of libraries is found and passes + # the link test, LIBRARIES is set to the list of complete library paths that + # have been found. Otherwise, LIBRARIES is set to FALSE. + + # N.B. _prefix is the prefix applied to the names of all cached variables that + # are generated internally and marked advanced by this macro. + + set(_libdir ${ARGN}) + + set(_libraries_work TRUE) + set(${LIBRARIES}) + set(_combined_name) + if (NOT _libdir) + if (WIN32) + set(_libdir ENV LIB) + elseif (APPLE) + set(_libdir ENV DYLD_LIBRARY_PATH) + else () + set(_libdir ENV LD_LIBRARY_PATH) + endif () + endif () + + list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + + foreach(_library ${_list}) + set(_combined_name ${_combined_name}_${_library}) + if(NOT "${_thread}" STREQUAL "") + set(_combined_name ${_combined_name}_thread) + endif() + if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + else () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS ${_libdir} + ) + mark_as_advanced(${_prefix}_${_library}_LIBRARY) + set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif() + endforeach() + if(_libraries_work) + # Test this combination of libraries. 
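+    # (When no Fortran compiler is enabled, the check below appends an
+    # underscore to the routine name to match common Fortran symbol mangling.)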
+ set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_thread}) + # message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") + if (CMAKE_Fortran_COMPILER_LOADED) + check_fortran_function_exists("${_name}" ${_prefix}${_combined_name}_WORKS) + else() + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + endif() + set(CMAKE_REQUIRED_LIBRARIES) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + endif() + if(_libraries_work) + if("${_list}" STREQUAL "") + set(${LIBRARIES} "${LIBRARIES}-PLACEHOLDER-FOR-EMPTY-LIBRARIES") + else() + set(${LIBRARIES} ${${LIBRARIES}} ${_thread}) # for static link + endif() + else() + set(${LIBRARIES} FALSE) + endif() + #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") +endmacro() + +set(BLAS_LINKER_FLAGS) +set(BLAS_LIBRARIES) +set(BLAS95_LIBRARIES) +if (NOT $ENV{BLA_VENDOR} STREQUAL "") + set(BLA_VENDOR $ENV{BLA_VENDOR}) +else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() +endif () + +if (BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # Implicitly linked BLAS libraries + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "" + "" + ) + endif() +endif () + +#BLAS in intel mkl 10+ library? (em64t 64bit) +if (BLA_VENDOR MATCHES "Intel" OR BLA_VENDOR STREQUAL "All") + if (NOT BLAS_LIBRARIES) + + # System-specific settings + if (WIN32) + if (BLA_STATIC) + set(BLAS_mkl_DLL_SUFFIX "") + else() + set(BLAS_mkl_DLL_SUFFIX "_dll") + endif() + else() + # Switch to GNU Fortran support layer if needed (but not on Apple, where MKL does not provide it) + if(CMAKE_Fortran_COMPILER_LOADED AND CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) + set(BLAS_mkl_INTFACE "gf") + set(BLAS_mkl_THREADING "gnu") + set(BLAS_mkl_OMP "gomp") + else() + set(BLAS_mkl_INTFACE "intel") + set(BLAS_mkl_THREADING "intel") + set(BLAS_mkl_OMP "iomp5") + endif() + set(BLAS_mkl_LM "-lm") + set(BLAS_mkl_LDL "-ldl") + endif() + + if (BLA_VENDOR MATCHES "_64ilp") + set(BLAS_mkl_ILP_MODE "ilp64") + else () + set(BLAS_mkl_ILP_MODE "lp64") + endif () + + if (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED) + if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) + find_package(Threads) + else() + find_package(Threads REQUIRED) + endif() + + set(BLAS_SEARCH_LIBS "") + + if(BLA_F95) + set(BLAS_mkl_SEARCH_SYMBOL sgemm_f95) + set(_LIBRARIES BLAS95_LIBRARIES) + if (WIN32) + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95${BLAS_mkl_DLL_SUFFIX} mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "^Intel10_64i?lp" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_blas95_${BLAS_mkl_ILP_MODE}${BLAS_mkl_DLL_SUFFIX} mkl_intel_${BLAS_mkl_ILP_MODE}${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + if (NOT BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND 
BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else () + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95 mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_blas95_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_sequential mkl_core") + endif () + endif () + else () + set(BLAS_mkl_SEARCH_SYMBOL sgemm) + set(_LIBRARIES BLAS_LIBRARIES) + if (WIN32) + # Find the main file (32-bit or 64-bit) + set(BLAS_SEARCH_LIBS_WIN_MAIN "") + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_c${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "^Intel10_64i?lp" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_MAIN + "mkl_intel_${BLAS_mkl_ILP_MODE}${BLAS_mkl_DLL_SUFFIX}") + endif () + + # Add threading/sequential libs + set(BLAS_SEARCH_LIBS_WIN_THREAD "") + if (NOT BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libguide40 mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "libiomp5md mkl_intel_thread${BLAS_mkl_DLL_SUFFIX}") + endif() + if (BLA_VENDOR MATCHES "_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS_WIN_THREAD + "mkl_sequential${BLAS_mkl_DLL_SUFFIX}") + endif() + + # Cartesian product of the above + foreach (MAIN ${BLAS_SEARCH_LIBS_WIN_MAIN}) + foreach (THREAD ${BLAS_SEARCH_LIBS_WIN_THREAD}) + list(APPEND BLAS_SEARCH_LIBS + "${MAIN} ${THREAD} mkl_core${BLAS_mkl_DLL_SUFFIX}") + endforeach() + endforeach() + else () + if (BLA_VENDOR STREQUAL "Intel10_32" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp$" OR BLA_VENDOR STREQUAL "All") + # old version + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core guide") + + # mkl >= 10.3 + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_${BLAS_mkl_THREADING}_thread mkl_core ${BLAS_mkl_OMP}") + endif () + if (BLA_VENDOR MATCHES "^Intel10_64i?lp_seq$" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl_${BLAS_mkl_INTFACE}_${BLAS_mkl_ILP_MODE} mkl_sequential mkl_core") + endif () + + #older vesions of intel mkl libs + if (BLA_VENDOR STREQUAL "Intel" OR BLA_VENDOR STREQUAL "All") + list(APPEND BLAS_SEARCH_LIBS + "mkl") + list(APPEND BLAS_SEARCH_LIBS + "mkl_ia32") + 
list(APPEND BLAS_SEARCH_LIBS + "mkl_em64t") + endif () + endif () + endif () + + if (DEFINED ENV{MKLROOT}) + if (BLA_VENDOR STREQUAL "Intel10_32") + set(_BLAS_MKLROOT_LIB_DIR "$ENV{MKLROOT}/lib/ia32") + elseif (BLA_VENDOR MATCHES "^Intel10_64i?lp$" OR BLA_VENDOR MATCHES "^Intel10_64i?lp_seq$") + set(_BLAS_MKLROOT_LIB_DIR "$ENV{MKLROOT}/lib/intel64") + endif () + endif () + if (_BLAS_MKLROOT_LIB_DIR) + if (WIN32) + string(APPEND _BLAS_MKLROOT_LIB_DIR "_win") + elseif (APPLE) + string(APPEND _BLAS_MKLROOT_LIB_DIR "_mac") + else () + string(APPEND _BLAS_MKLROOT_LIB_DIR "_lin") + endif () + endif () + + foreach (IT ${BLAS_SEARCH_LIBS}) + string(REPLACE " " ";" SEARCH_LIBS ${IT}) + if (NOT ${_LIBRARIES}) + check_fortran_libraries( + ${_LIBRARIES} + BLAS + ${BLAS_mkl_SEARCH_SYMBOL} + "" + "${SEARCH_LIBS}" + "${CMAKE_THREAD_LIBS_INIT};${BLAS_mkl_LM};${BLAS_mkl_LDL}" + "${_BLAS_MKLROOT_LIB_DIR}" + ) + endif () + endforeach () + + endif () + unset(BLAS_mkl_ILP_MODE) + unset(BLAS_mkl_INTFACE) + unset(BLAS_mkl_THREADING) + unset(BLAS_mkl_OMP) + unset(BLAS_mkl_DLL_SUFFIX) + unset(BLAS_mkl_LM) + unset(BLAS_mkl_LDL) + endif () +endif () + +if(BLA_F95) + find_package_handle_standard_args(BLAS REQUIRED_VARS BLAS95_LIBRARIES) + set(BLAS95_FOUND ${BLAS_FOUND}) + if(BLAS_FOUND) + set(BLAS_LIBRARIES "${BLAS95_LIBRARIES}") + endif() +endif() + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # gotoblas (http://www.tacc.utexas.edu/tacc-projects/gotoblas2) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "goto2" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "OpenBLAS" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # OpenBLAS (http://www.openblas.net) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "" + ) + endif() + if(NOT BLAS_LIBRARIES) + find_package(Threads) + # OpenBLAS (http://www.openblas.net) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "openblas" + "${CMAKE_THREAD_LIBS_INIT}" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "FLAME" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # FLAME's blis library (https://github.com/flame/blis) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "blis" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "ATLAS" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + # BLAS in ATLAS library? (http://math-atlas.sourceforge.net/) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "f77blas;atlas" + "" + ) + endif() +endif () + +# BLAS in PhiPACK libraries? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "PhiPACK" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "sgemm;dgemm;blas" + "" + ) + endif() +endif () + +# BLAS in Alpha CXML library? +if (BLA_VENDOR STREQUAL "CXML" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "cxml" + "" + ) + endif() +endif () + +# BLAS in Alpha DXML library? (now called CXML, see above) +if (BLA_VENDOR STREQUAL "DXML" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "dxml" + "" + ) + endif() +endif () + +# BLAS in Sun Performance library? 
+if (BLA_VENDOR STREQUAL "SunPerf" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "-xlic_lib=sunperf" + "sunperf;sunmath" + "" + ) + if(BLAS_LIBRARIES) + set(BLAS_LINKER_FLAGS "-xlic_lib=sunperf") + endif() + endif() +endif () + +# BLAS in SCSL library? (SGI/Cray Scientific Library) +if (BLA_VENDOR STREQUAL "SCSL" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "scsl" + "" + ) + endif() +endif () + +# BLAS in SGIMATH library? +if (BLA_VENDOR STREQUAL "SGIMATH" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "complib.sgimath" + "" + ) + endif() +endif () + +# BLAS in IBM ESSL library? (requires generic BLAS lib, too) +if (BLA_VENDOR STREQUAL "IBMESSL" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "essl;blas" + "" + ) + endif() +endif () + +#BLAS in acml library? +if (BLA_VENDOR MATCHES "ACML" OR BLA_VENDOR STREQUAL "All") + if( ((BLA_VENDOR STREQUAL "ACML") AND (NOT BLAS_ACML_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_MP") AND (NOT BLAS_ACML_MP_LIB_DIRS)) OR + ((BLA_VENDOR STREQUAL "ACML_GPU") AND (NOT BLAS_ACML_GPU_LIB_DIRS)) + ) + # try to find acml in "standard" paths + if( WIN32 ) + file( GLOB _ACML_ROOT "C:/AMD/acml*/ACML-EULA.txt" ) + else() + file( GLOB _ACML_ROOT "/opt/acml*/ACML-EULA.txt" ) + endif() + if( WIN32 ) + file( GLOB _ACML_GPU_ROOT "C:/AMD/acml*/GPGPUexamples" ) + else() + file( GLOB _ACML_GPU_ROOT "/opt/acml*/GPGPUexamples" ) + endif() + list(GET _ACML_ROOT 0 _ACML_ROOT) + list(GET _ACML_GPU_ROOT 0 _ACML_GPU_ROOT) + if( _ACML_ROOT ) + get_filename_component( _ACML_ROOT ${_ACML_ROOT} PATH ) + if( SIZEOF_INTEGER EQUAL 8 ) + set( _ACML_PATH_SUFFIX "_int64" ) + else() + set( _ACML_PATH_SUFFIX "" ) + endif() + if( CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" ) + set( _ACML_COMPILER32 "ifort32" ) + set( _ACML_COMPILER64 "ifort64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "SunPro" ) + set( _ACML_COMPILER32 "sun32" ) + set( _ACML_COMPILER64 "sun64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) + set( _ACML_COMPILER32 "pgi32" ) + if( WIN32 ) + set( _ACML_COMPILER64 "win64" ) + else() + set( _ACML_COMPILER64 "pgi64" ) + endif() + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "Open64" ) + # 32 bit builds not supported on Open64 but for code simplicity + # We'll just use the same directory twice + set( _ACML_COMPILER32 "open64_64" ) + set( _ACML_COMPILER64 "open64_64" ) + elseif( CMAKE_Fortran_COMPILER_ID STREQUAL "NAG" ) + set( _ACML_COMPILER32 "nag32" ) + set( _ACML_COMPILER64 "nag64" ) + else() + set( _ACML_COMPILER32 "gfortran32" ) + set( _ACML_COMPILER64 "gfortran64" ) + endif() + + if( BLA_VENDOR STREQUAL "ACML_MP" ) + set(_ACML_MP_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}_mp${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}_mp${_ACML_PATH_SUFFIX}/lib" ) + else() + set(_ACML_LIB_DIRS + "${_ACML_ROOT}/${_ACML_COMPILER32}${_ACML_PATH_SUFFIX}/lib" + "${_ACML_ROOT}/${_ACML_COMPILER64}${_ACML_PATH_SUFFIX}/lib" ) + endif() + endif() +elseif(BLAS_${BLA_VENDOR}_LIB_DIRS) + set(_${BLA_VENDOR}_LIB_DIRS ${BLAS_${BLA_VENDOR}_LIB_DIRS}) +endif() + +if( BLA_VENDOR STREQUAL "ACML_MP" ) + foreach( BLAS_ACML_MP_LIB_DIRS ${_ACML_MP_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml_mp;acml_mv" "" ${BLAS_ACML_MP_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + 
break() + endif() + endforeach() +elseif( BLA_VENDOR STREQUAL "ACML_GPU" ) + foreach( BLAS_ACML_GPU_LIB_DIRS ${_ACML_GPU_LIB_DIRS}) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv;CALBLAS" "" ${BLAS_ACML_GPU_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() +else() + foreach( BLAS_ACML_LIB_DIRS ${_ACML_LIB_DIRS} ) + check_fortran_libraries ( + BLAS_LIBRARIES + BLAS + sgemm + "" "acml;acml_mv" "" ${BLAS_ACML_LIB_DIRS} + ) + if( BLAS_LIBRARIES ) + break() + endif() + endforeach() +endif() + +# Either acml or acml_mp should be in LD_LIBRARY_PATH but not both +if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml;acml_mv" + "" + ) +endif() +if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml_mp;acml_mv" + "" + ) +endif() +if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "acml;acml_mv;CALBLAS" + "" + ) +endif() +endif () # ACML + +# Apple BLAS library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "Accelerate" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + if ( NOT BLAS_LIBRARIES ) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + dgemm + "" + "vecLib" + "" + ) + endif () +endif () + +# Generic BLAS library? +if (BLA_VENDOR STREQUAL "Generic" OR BLA_VENDOR STREQUAL "All") + if(NOT BLAS_LIBRARIES) + check_fortran_libraries( + BLAS_LIBRARIES + BLAS + sgemm + "" + "blas" + "" + ) + endif() +endif () + +if(NOT BLA_F95) + find_package_handle_standard_args(BLAS REQUIRED_VARS BLAS_LIBRARIES) +endif() + +# On compilers that implicitly link BLAS (such as ftn, cc, and CC on Cray HPC machines) +# we used a placeholder for empty BLAS_LIBRARIES to get through our logic above. +if (BLAS_LIBRARIES STREQUAL "BLAS_LIBRARIES-PLACEHOLDER-FOR-EMPTY-LIBRARIES") + set(BLAS_LIBRARIES "") +endif() + +cmake_pop_check_state() +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) diff --git a/cmake/FindCUB.cmake b/cmake/FindCUB.cmake new file mode 100644 index 00000000000..33c8a926f97 --- /dev/null +++ b/cmake/FindCUB.cmake @@ -0,0 +1,25 @@ +# Try to find the CUB library and headers. +# CUB_ROOT_DIR - where to find + +# CUB_FOUND - system has CUB +# CUB_INCLUDE_DIRS - the CUB include directory + + +find_path(CUB_INCLUDE_DIR + NAMES cub/cub.cuh + HINTS ${CUB_ROOT_DIR} + DOC "The directory where CUB includes reside" +) + +set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUB + FOUND_VAR CUB_FOUND + REQUIRED_VARS CUB_INCLUDE_DIR +) + +mark_as_advanced(CUB_FOUND) + +add_library(CUB INTERFACE) +target_include_directories(CUB INTERFACE ${CUB_INCLUDE_DIR}) diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake new file mode 100644 index 00000000000..8c460082c36 --- /dev/null +++ b/cmake/FindICU.cmake @@ -0,0 +1,428 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindICU +------- + +Find the International Components for Unicode (ICU) libraries and +programs. + +This module supports multiple components. +Components can include any of: ``data``, ``i18n``, ``io``, ``le``, +``lx``, ``test``, ``tu`` and ``uc``. 
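+
+For example, a typical request for the ``uc`` and ``i18n`` components, linking
+the imported targets described below (a minimal sketch; ``my_target`` is a
+placeholder):
+
+.. code-block:: cmake
+
+  find_package(ICU REQUIRED COMPONENTS uc i18n)
+  target_link_libraries(my_target PRIVATE ICU::uc ICU::i18n)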
+ +Note that on Windows ``data`` is named ``dt`` and ``i18n`` is named +``in``; any of the names may be used, and the appropriate +platform-specific library name will be automatically selected. + +This module reports information about the ICU installation in +several variables. General variables:: + + ICU_VERSION - ICU release version + ICU_FOUND - true if the main programs and libraries were found + ICU_LIBRARIES - component libraries to be linked + ICU_INCLUDE_DIRS - the directories containing the ICU headers + +Imported targets:: + + ICU:: + +Where ```` is the name of an ICU component, for example +``ICU::i18n``. + +ICU programs are reported in:: + + ICU_GENCNVAL_EXECUTABLE - path to gencnval executable + ICU_ICUINFO_EXECUTABLE - path to icuinfo executable + ICU_GENBRK_EXECUTABLE - path to genbrk executable + ICU_ICU-CONFIG_EXECUTABLE - path to icu-config executable + ICU_GENRB_EXECUTABLE - path to genrb executable + ICU_GENDICT_EXECUTABLE - path to gendict executable + ICU_DERB_EXECUTABLE - path to derb executable + ICU_PKGDATA_EXECUTABLE - path to pkgdata executable + ICU_UCONV_EXECUTABLE - path to uconv executable + ICU_GENCFU_EXECUTABLE - path to gencfu executable + ICU_MAKECONV_EXECUTABLE - path to makeconv executable + ICU_GENNORM2_EXECUTABLE - path to gennorm2 executable + ICU_GENCCODE_EXECUTABLE - path to genccode executable + ICU_GENSPREP_EXECUTABLE - path to gensprep executable + ICU_ICUPKG_EXECUTABLE - path to icupkg executable + ICU_GENCMN_EXECUTABLE - path to gencmn executable + +ICU component libraries are reported in:: + + ICU__FOUND - ON if component was found + ICU__LIBRARIES - libraries for component + +ICU datafiles are reported in:: + + ICU_MAKEFILE_INC - Makefile.inc + ICU_PKGDATA_INC - pkgdata.inc + +Note that ```` is the uppercased name of the component. + +This module reads hints about search results from:: + + ICU_ROOT - the root of the ICU installation + +The environment variable ``ICU_ROOT`` may also be used; the +ICU_ROOT variable takes precedence. + +The following cache variables may also be set:: + + ICU_

<P>_EXECUTABLE - the path to executable <P>

+ ICU_INCLUDE_DIR - the directory containing the ICU headers + ICU__LIBRARY - the library for component + +.. note:: + + In most cases none of the above variables will require setting, + unless multiple ICU versions are available and a specific version + is required. + +Other variables one may set to control this module are:: + + ICU_DEBUG - Set to ON to enable debug output from FindICU. +#]=======================================================================] + +# Written by Roger Leigh + +set(icu_programs + gencnval + icuinfo + genbrk + icu-config + genrb + gendict + derb + pkgdata + uconv + gencfu + makeconv + gennorm2 + genccode + gensprep + icupkg + gencmn) + +set(icu_data + Makefile.inc + pkgdata.inc) + +# The ICU checks are contained in a function due to the large number +# of temporary variables needed. +function(_ICU_FIND) + # Set up search paths, taking compiler into account. Search ICU_ROOT, + # with ICU_ROOT in the environment as a fallback if unset. + if(ICU_ROOT) + list(APPEND icu_roots "${ICU_ROOT}") + else() + if(NOT "$ENV{ICU_ROOT}" STREQUAL "") + file(TO_CMAKE_PATH "$ENV{ICU_ROOT}" NATIVE_PATH) + list(APPEND icu_roots "${NATIVE_PATH}") + set(ICU_ROOT "${NATIVE_PATH}" + CACHE PATH "Location of the ICU installation" FORCE) + endif() + endif() + + # Find include directory + list(APPEND icu_include_suffixes "include") + find_path(ICU_INCLUDE_DIR + NAMES "unicode/utypes.h" + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_include_suffixes} + DOC "ICU include directory") + set(ICU_INCLUDE_DIR "${ICU_INCLUDE_DIR}" PARENT_SCOPE) + + # Get version + if(ICU_INCLUDE_DIR AND EXISTS "${ICU_INCLUDE_DIR}/unicode/uvernum.h") + file(STRINGS "${ICU_INCLUDE_DIR}/unicode/uvernum.h" icu_header_str + REGEX "^#define[\t ]+U_ICU_VERSION[\t ]+\".*\".*") + + string(REGEX REPLACE "^#define[\t ]+U_ICU_VERSION[\t ]+\"([^ \\n]*)\".*" + "\\1" icu_version_string "${icu_header_str}") + set(ICU_VERSION "${icu_version_string}") + set(ICU_VERSION "${icu_version_string}" PARENT_SCOPE) + unset(icu_header_str) + unset(icu_version_string) + endif() + + if(CMAKE_SIZEOF_VOID_P EQUAL 8) + # 64-bit binary directory + set(_bin64 "bin64") + # 64-bit library directory + set(_lib64 "lib64") + endif() + + + # Find all ICU programs + list(APPEND icu_binary_suffixes "${_bin64}" "bin" "sbin") + foreach(program ${icu_programs}) + string(TOUPPER "${program}" program_upcase) + set(cache_var "ICU_${program_upcase}_EXECUTABLE") + set(program_var "ICU_${program_upcase}_EXECUTABLE") + find_program("${cache_var}" + NAMES "${program}" + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_binary_suffixes} + DOC "ICU ${program} executable" + NO_PACKAGE_ROOT_PATH + ) + mark_as_advanced(cache_var) + set("${program_var}" "${${cache_var}}" PARENT_SCOPE) + endforeach() + + # Find all ICU libraries + list(APPEND icu_library_suffixes "${_lib64}" "lib") + set(ICU_REQUIRED_LIBS_FOUND ON) + set(static_prefix ) + # static icu libraries compiled with MSVC have the prefix 's' + if(MSVC) + set(static_prefix "s") + endif() + foreach(component ${ICU_FIND_COMPONENTS}) + string(TOUPPER "${component}" component_upcase) + set(component_cache "ICU_${component_upcase}_LIBRARY") + set(component_cache_release "${component_cache}_RELEASE") + set(component_cache_debug "${component_cache}_DEBUG") + set(component_found "${component_upcase}_FOUND") + set(component_libnames "icu${component}") + set(component_debug_libnames "icu${component}d") + + # Special case deliberate library naming mismatches between Unix + # and Windows builds + unset(component_libnames) + 
unset(component_debug_libnames) + list(APPEND component_libnames "icu${component}") + list(APPEND component_debug_libnames "icu${component}d") + if(component STREQUAL "data") + list(APPEND component_libnames "icudt") + # Note there is no debug variant at present + list(APPEND component_debug_libnames "icudtd") + endif() + if(component STREQUAL "dt") + list(APPEND component_libnames "icudata") + # Note there is no debug variant at present + list(APPEND component_debug_libnames "icudatad") + endif() + if(component STREQUAL "i18n") + list(APPEND component_libnames "icuin") + list(APPEND component_debug_libnames "icuind") + endif() + if(component STREQUAL "in") + list(APPEND component_libnames "icui18n") + list(APPEND component_debug_libnames "icui18nd") + endif() + + if(static_prefix) + unset(static_component_libnames) + unset(static_component_debug_libnames) + foreach(component_libname ${component_libnames}) + list(APPEND static_component_libnames + ${static_prefix}${component_libname}) + endforeach() + foreach(component_libname ${component_debug_libnames}) + list(APPEND static_component_debug_libnames + ${static_prefix}${component_libname}) + endforeach() + list(APPEND component_libnames ${static_component_libnames}) + list(APPEND component_debug_libnames ${static_component_debug_libnames}) + endif() + find_library("${component_cache_release}" + NAMES ${component_libnames} + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_library_suffixes} + DOC "ICU ${component} library (release)" + NO_PACKAGE_ROOT_PATH + ) + find_library("${component_cache_debug}" + NAMES ${component_debug_libnames} + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_library_suffixes} + DOC "ICU ${component} library (debug)" + NO_PACKAGE_ROOT_PATH + ) + include(SelectLibraryConfigurations) + select_library_configurations(ICU_${component_upcase}) + mark_as_advanced("${component_cache_release}" "${component_cache_debug}") + if(${component_cache}) + set("${component_found}" ON) + list(APPEND ICU_LIBRARY "${${component_cache}}") + endif() + mark_as_advanced("${component_found}") + set("${component_cache}" "${${component_cache}}" PARENT_SCOPE) + set("${component_found}" "${${component_found}}" PARENT_SCOPE) + if(${component_found}) + if (ICU_FIND_REQUIRED_${component}) + list(APPEND ICU_LIBS_FOUND "${component} (required)") + else() + list(APPEND ICU_LIBS_FOUND "${component} (optional)") + endif() + else() + if (ICU_FIND_REQUIRED_${component}) + set(ICU_REQUIRED_LIBS_FOUND OFF) + list(APPEND ICU_LIBS_NOTFOUND "${component} (required)") + else() + list(APPEND ICU_LIBS_NOTFOUND "${component} (optional)") + endif() + endif() + endforeach() + set(_ICU_REQUIRED_LIBS_FOUND "${ICU_REQUIRED_LIBS_FOUND}" PARENT_SCOPE) + set(ICU_LIBRARY "${ICU_LIBRARY}" PARENT_SCOPE) + + # Find all ICU data files + if(CMAKE_LIBRARY_ARCHITECTURE) + list(APPEND icu_data_suffixes + "${_lib64}/${CMAKE_LIBRARY_ARCHITECTURE}/icu/${ICU_VERSION}" + "lib/${CMAKE_LIBRARY_ARCHITECTURE}/icu/${ICU_VERSION}" + "${_lib64}/${CMAKE_LIBRARY_ARCHITECTURE}/icu" + "lib/${CMAKE_LIBRARY_ARCHITECTURE}/icu") + endif() + list(APPEND icu_data_suffixes + "${_lib64}/icu/${ICU_VERSION}" + "lib/icu/${ICU_VERSION}" + "${_lib64}/icu" + "lib/icu") + foreach(data ${icu_data}) + string(TOUPPER "${data}" data_upcase) + string(REPLACE "." 
"_" data_upcase "${data_upcase}") + set(cache_var "ICU_${data_upcase}") + set(data_var "ICU_${data_upcase}") + find_file("${cache_var}" + NAMES "${data}" + HINTS ${icu_roots} + PATH_SUFFIXES ${icu_data_suffixes} + DOC "ICU ${data} data file") + mark_as_advanced(cache_var) + set("${data_var}" "${${cache_var}}" PARENT_SCOPE) + endforeach() + + if(NOT ICU_FIND_QUIETLY) + if(ICU_LIBS_FOUND) + message(STATUS "Found the following ICU libraries:") + foreach(found ${ICU_LIBS_FOUND}) + message(STATUS " ${found}") + endforeach() + endif() + if(ICU_LIBS_NOTFOUND) + message(STATUS "The following ICU libraries were not found:") + foreach(notfound ${ICU_LIBS_NOTFOUND}) + message(STATUS " ${notfound}") + endforeach() + endif() + endif() + + if(ICU_DEBUG) + message(STATUS "--------FindICU.cmake search debug--------") + message(STATUS "ICU binary path search order: ${icu_roots}") + message(STATUS "ICU include path search order: ${icu_roots}") + message(STATUS "ICU library path search order: ${icu_roots}") + message(STATUS "----------------") + endif() +endfunction() + +_ICU_FIND() + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(ICU + FOUND_VAR ICU_FOUND + REQUIRED_VARS ICU_INCLUDE_DIR + ICU_LIBRARY + _ICU_REQUIRED_LIBS_FOUND + VERSION_VAR ICU_VERSION + FAIL_MESSAGE "Failed to find all ICU components") + +unset(_ICU_REQUIRED_LIBS_FOUND) + +if(ICU_FOUND) + set(ICU_INCLUDE_DIRS "${ICU_INCLUDE_DIR}") + set(ICU_LIBRARIES "${ICU_LIBRARY}") + foreach(_ICU_component ${ICU_FIND_COMPONENTS}) + string(TOUPPER "${_ICU_component}" _ICU_component_upcase) + set(_ICU_component_cache "ICU_${_ICU_component_upcase}_LIBRARY") + set(_ICU_component_cache_release "ICU_${_ICU_component_upcase}_LIBRARY_RELEASE") + set(_ICU_component_cache_debug "ICU_${_ICU_component_upcase}_LIBRARY_DEBUG") + set(_ICU_component_lib "ICU_${_ICU_component_upcase}_LIBRARIES") + set(_ICU_component_found "${_ICU_component_upcase}_FOUND") + set(_ICU_imported_target "ICU::${_ICU_component}") + if(${_ICU_component_found}) + set("${_ICU_component_lib}" "${${_ICU_component_cache}}") + if(NOT TARGET ${_ICU_imported_target}) + add_library(${_ICU_imported_target} UNKNOWN IMPORTED) + if(ICU_INCLUDE_DIR) + set_target_properties(${_ICU_imported_target} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${ICU_INCLUDE_DIR}") + endif() + if(EXISTS "${${_ICU_component_cache}}") + set_target_properties(${_ICU_imported_target} PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${${_ICU_component_cache}}") + endif() + if(EXISTS "${${_ICU_component_cache_release}}") + set_property(TARGET ${_ICU_imported_target} APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(${_ICU_imported_target} PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LOCATION_RELEASE "${${_ICU_component_cache_release}}") + endif() + if(EXISTS "${${_ICU_component_cache_debug}}") + set_property(TARGET ${_ICU_imported_target} APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(${_ICU_imported_target} PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_DEBUG "CXX" + IMPORTED_LOCATION_DEBUG "${${_ICU_component_cache_debug}}") + endif() + if(CMAKE_DL_LIBS AND _ICU_component STREQUAL "uc") + set_target_properties(${_ICU_imported_target} PROPERTIES + INTERFACE_LINK_LIBRARIES "${CMAKE_DL_LIBS}") + endif() + endif() + endif() + unset(_ICU_component_upcase) + unset(_ICU_component_cache) + unset(_ICU_component_lib) + unset(_ICU_component_found) + unset(_ICU_imported_target) + endforeach() +endif() + 
+if(ICU_DEBUG) + message(STATUS "--------FindICU.cmake results debug--------") + message(STATUS "ICU found: ${ICU_FOUND}") + message(STATUS "ICU_VERSION number: ${ICU_VERSION}") + message(STATUS "ICU_ROOT directory: ${ICU_ROOT}") + message(STATUS "ICU_INCLUDE_DIR directory: ${ICU_INCLUDE_DIR}") + message(STATUS "ICU_LIBRARIES: ${ICU_LIBRARIES}") + + foreach(program IN LISTS icu_programs) + string(TOUPPER "${program}" program_upcase) + set(program_lib "ICU_${program_upcase}_EXECUTABLE") + message(STATUS "${program} program: ${${program_lib}}") + unset(program_upcase) + unset(program_lib) + endforeach() + + foreach(data IN LISTS icu_data) + string(TOUPPER "${data}" data_upcase) + string(REPLACE "." "_" data_upcase "${data_upcase}") + set(data_lib "ICU_${data_upcase}") + message(STATUS "${data} data: ${${data_lib}}") + unset(data_upcase) + unset(data_lib) + endforeach() + + foreach(component IN LISTS ICU_FIND_COMPONENTS) + string(TOUPPER "${component}" component_upcase) + set(component_lib "ICU_${component_upcase}_LIBRARIES") + set(component_found "${component_upcase}_FOUND") + message(STATUS "${component} library found: ${${component_found}}") + message(STATUS "${component} library: ${${component_lib}}") + unset(component_upcase) + unset(component_lib) + unset(component_found) + endforeach() + message(STATUS "----------------") +endif() + +unset(icu_programs) diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake new file mode 100644 index 00000000000..60fbf0726a0 --- /dev/null +++ b/cmake/FindLAPACK.cmake @@ -0,0 +1,430 @@ +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file Copyright.txt or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindLAPACK +---------- + +Find Linear Algebra PACKage (LAPACK) library + +This module finds an installed fortran library that implements the +LAPACK linear-algebra interface (see http://www.netlib.org/lapack/). + +The approach follows that taken for the autoconf macro file, +``acx_lapack.m4`` (distributed at +http://ac-archive.sourceforge.net/ac-archive/acx_lapack.html). + +Input Variables +^^^^^^^^^^^^^^^ + +The following variables may be set to influence this module's behavior: + +``BLA_STATIC`` + if ``ON`` use static linkage + +``BLA_VENDOR`` + If set, checks only the specified vendor, if not set checks all the + possibilities. List of vendors valid in this module: + + * ``Intel10_32`` (intel mkl v10 32 bit) + * ``Intel10_64lp`` (intel mkl v10+ 64 bit, threaded code, lp64 model) + * ``Intel10_64lp_seq`` (intel mkl v10+ 64 bit, sequential code, lp64 model) + * ``Intel10_64ilp`` (intel mkl v10+ 64 bit, threaded code, ilp64 model) + * ``Intel10_64ilp_seq`` (intel mkl v10+ 64 bit, sequential code, ilp64 model) + * ``Intel`` (obsolete versions of mkl 32 and 64 bit) + * ``OpenBLAS`` + * ``FLAME`` + * ``ACML`` + * ``Apple`` + * ``NAS`` + * ``Generic`` + +``BLA_F95`` + if ``ON`` tries to find BLAS95/LAPACK95 + +Result Variables +^^^^^^^^^^^^^^^^ + +This module defines the following variables: + +``LAPACK_FOUND`` + library implementing the LAPACK interface is found +``LAPACK_LINKER_FLAGS`` + uncached list of required linker flags (excluding -l and -L). +``LAPACK_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use LAPACK +``LAPACK95_LIBRARIES`` + uncached list of libraries (using full path name) to link against + to use LAPACK95 +``LAPACK95_FOUND`` + library implementing the LAPACK95 interface is found + +.. 
note:: + + C or CXX must be enabled to use Intel MKL + + For example, to use Intel MKL libraries and/or Intel compiler: + + .. code-block:: cmake + + set(BLA_VENDOR Intel10_64lp) + find_package(LAPACK) +#]=======================================================================] + +set(_lapack_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) + +# Check the language being used +if( NOT (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED OR CMAKE_Fortran_COMPILER_LOADED) ) + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR "FindLAPACK requires Fortran, C, or C++ to be enabled.") + else() + message(STATUS "Looking for LAPACK... - NOT found (Unsupported languages)") + return() + endif() +endif() + +if (CMAKE_Fortran_COMPILER_LOADED) +include(CheckFortranFunctionExists) +else () +include(CheckFunctionExists) +endif () +include(CMakePushCheckState) + +cmake_push_check_state() +set(CMAKE_REQUIRED_QUIET ${LAPACK_FIND_QUIETLY}) + +set(LAPACK_FOUND FALSE) +set(LAPACK95_FOUND FALSE) + +# TODO: move this stuff to separate module + +macro(Check_Lapack_Libraries LIBRARIES _prefix _name _flags _list _blas _threads) +# This macro checks for the existence of the combination of fortran libraries +# given by _list. If the combination is found, this macro checks (using the +# Check_Fortran_Function_Exists macro) whether can link against that library +# combination using the name of a routine given by _name using the linker +# flags given by _flags. If the combination of libraries is found and passes +# the link test, LIBRARIES is set to the list of complete library paths that +# have been found. Otherwise, LIBRARIES is set to FALSE. + +# N.B. _prefix is the prefix applied to the names of all cached variables that +# are generated internally and marked advanced by this macro. + +set(_libraries_work TRUE) +set(${LIBRARIES}) +set(_combined_name) +if (NOT _libdir) + if (WIN32) + set(_libdir ENV LIB) + elseif (APPLE) + set(_libdir ENV DYLD_LIBRARY_PATH) + else () + set(_libdir ENV LD_LIBRARY_PATH) + endif () +endif () + +list(APPEND _libdir "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}") + +foreach(_library ${_list}) + set(_combined_name ${_combined_name}_${_library}) + + if(_libraries_work) + if (BLA_STATIC) + if (WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + if (APPLE) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else () + set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif () + else () + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # for ubuntu's libblas3gf and liblapack3gf packages + set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES} .so.3gf) + endif () + endif () + find_library(${_prefix}_${_library}_LIBRARY + NAMES ${_library} + PATHS ${_libdir} + ) + mark_as_advanced(${_prefix}_${_library}_LIBRARY) + set(${LIBRARIES} ${${LIBRARIES}} ${${_prefix}_${_library}_LIBRARY}) + set(_libraries_work ${${_prefix}_${_library}_LIBRARY}) + endif() +endforeach() + +if(_libraries_work) + # Test this combination of libraries. 
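+  # (For static builds on UNIX, --start-group/--end-group below lets the linker
+  # resolve circular symbol references between the LAPACK and BLAS archives.)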
+ if(UNIX AND BLA_STATIC) + set(CMAKE_REQUIRED_LIBRARIES ${_flags} "-Wl,--start-group" ${${LIBRARIES}} ${_blas} "-Wl,--end-group" ${_threads}) + else() + set(CMAKE_REQUIRED_LIBRARIES ${_flags} ${${LIBRARIES}} ${_blas} ${_threads}) + endif() +# message("DEBUG: CMAKE_REQUIRED_LIBRARIES = ${CMAKE_REQUIRED_LIBRARIES}") + if (NOT CMAKE_Fortran_COMPILER_LOADED) + check_function_exists("${_name}_" ${_prefix}${_combined_name}_WORKS) + else () + check_fortran_function_exists(${_name} ${_prefix}${_combined_name}_WORKS) + endif () + set(CMAKE_REQUIRED_LIBRARIES) + set(_libraries_work ${${_prefix}${_combined_name}_WORKS}) + #message("DEBUG: ${LIBRARIES} = ${${LIBRARIES}}") +endif() + +if(_libraries_work) + set(${LIBRARIES} ${${LIBRARIES}} ${_blas} ${_threads}) +else() + set(${LIBRARIES} FALSE) +endif() + +endmacro() + + +set(LAPACK_LINKER_FLAGS) +set(LAPACK_LIBRARIES) +set(LAPACK95_LIBRARIES) + + +if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) + find_package(BLAS) +else() + find_package(BLAS REQUIRED) +endif() + + +if(BLAS_FOUND) + set(LAPACK_LINKER_FLAGS ${BLAS_LINKER_FLAGS}) + if (NOT $ENV{BLA_VENDOR} STREQUAL "") + set(BLA_VENDOR $ENV{BLA_VENDOR}) + else () + if(NOT BLA_VENDOR) + set(BLA_VENDOR "All") + endif() + endif () + +#intel lapack +if (BLA_VENDOR MATCHES "Intel" OR BLA_VENDOR STREQUAL "All") + if (NOT WIN32) + set(LAPACK_mkl_LM "-lm") + set(LAPACK_mkl_LDL "-ldl") + endif () + if (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED) + if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) + find_PACKAGE(Threads) + else() + find_package(Threads REQUIRED) + endif() + + if (BLA_VENDOR MATCHES "_64ilp") + set(LAPACK_mkl_ILP_MODE "ilp64") + else () + set(LAPACK_mkl_ILP_MODE "lp64") + endif () + + set(LAPACK_SEARCH_LIBS "") + + if (BLA_F95) + set(LAPACK_mkl_SEARCH_SYMBOL "cheev_f95") + set(_LIBRARIES LAPACK95_LIBRARIES) + set(_BLAS_LIBRARIES ${BLAS95_LIBRARIES}) + + # old + list(APPEND LAPACK_SEARCH_LIBS + "mkl_lapack95") + # new >= 10.3 + list(APPEND LAPACK_SEARCH_LIBS + "mkl_intel_c") + list(APPEND LAPACK_SEARCH_LIBS + "mkl_lapack95_${LAPACK_mkl_ILP_MODE}") + else() + set(LAPACK_mkl_SEARCH_SYMBOL "cheev") + set(_LIBRARIES LAPACK_LIBRARIES) + set(_BLAS_LIBRARIES ${BLAS_LIBRARIES}) + + # old + list(APPEND LAPACK_SEARCH_LIBS + "mkl_lapack") + endif() + + # First try empty lapack libs + if (NOT ${_LIBRARIES}) + check_lapack_libraries( + ${_LIBRARIES} + LAPACK + ${LAPACK_mkl_SEARCH_SYMBOL} + "" + "" + "${_BLAS_LIBRARIES}" + "" + ) + endif () + # Then try the search libs + foreach (IT ${LAPACK_SEARCH_LIBS}) + if (NOT ${_LIBRARIES}) + check_lapack_libraries( + ${_LIBRARIES} + LAPACK + ${LAPACK_mkl_SEARCH_SYMBOL} + "" + "${IT}" + "${_BLAS_LIBRARIES}" + "${CMAKE_THREAD_LIBS_INIT};${LAPACK_mkl_LM};${LAPACK_mkl_LDL}" + ) + endif () + endforeach () + + unset(LAPACK_mkl_ILP_MODE) + unset(LAPACK_mkl_SEARCH_SYMBOL) + unset(LAPACK_mkl_LM) + unset(LAPACK_mkl_LDL) + endif () +endif() + +if (BLA_VENDOR STREQUAL "Goto" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "goto2" + "${BLAS_LIBRARIES}" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "OpenBLAS" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "openblas" + "${BLAS_LIBRARIES}" + "" + ) + endif() +endif () + +if (BLA_VENDOR STREQUAL "FLAME" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "flame" + "${BLAS_LIBRARIES}" + 
"" + ) + endif() +endif () + +#acml lapack +if (BLA_VENDOR MATCHES "ACML" OR BLA_VENDOR STREQUAL "All") + if (BLAS_LIBRARIES MATCHES ".+acml.+") + set (LAPACK_LIBRARIES ${BLAS_LIBRARIES}) + endif () +endif () + +# Apple LAPACK library? +if (BLA_VENDOR STREQUAL "Apple" OR BLA_VENDOR STREQUAL "All") + if(NOT LAPACK_LIBRARIES) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "Accelerate" + "${BLAS_LIBRARIES}" + "" + ) + endif() +endif () +if (BLA_VENDOR STREQUAL "NAS" OR BLA_VENDOR STREQUAL "All") + if ( NOT LAPACK_LIBRARIES ) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "vecLib" + "${BLAS_LIBRARIES}" + "" + ) + endif () +endif () +# Generic LAPACK library? +if (BLA_VENDOR STREQUAL "Generic" OR + BLA_VENDOR STREQUAL "ATLAS" OR + BLA_VENDOR STREQUAL "All") + if ( NOT LAPACK_LIBRARIES ) + check_lapack_libraries( + LAPACK_LIBRARIES + LAPACK + cheev + "" + "lapack" + "${BLAS_LIBRARIES}" + "" + ) + endif () +endif () + +else() + message(STATUS "LAPACK requires BLAS") +endif() + +if(BLA_F95) + if(LAPACK95_LIBRARIES) + set(LAPACK95_FOUND TRUE) + else() + set(LAPACK95_FOUND FALSE) + endif() + if(NOT LAPACK_FIND_QUIETLY) + if(LAPACK95_FOUND) + message(STATUS "A library with LAPACK95 API found.") + else() + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with LAPACK95 API not found. Please specify library location." + ) + else() + message(STATUS + "A library with LAPACK95 API not found. Please specify library location." + ) + endif() + endif() + endif() + set(LAPACK_FOUND "${LAPACK95_FOUND}") + set(LAPACK_LIBRARIES "${LAPACK95_LIBRARIES}") +else() + if(LAPACK_LIBRARIES) + set(LAPACK_FOUND TRUE) + else() + set(LAPACK_FOUND FALSE) + endif() + + if(NOT LAPACK_FIND_QUIETLY) + if(LAPACK_FOUND) + message(STATUS "A library with LAPACK API found.") + else() + if(LAPACK_FIND_REQUIRED) + message(FATAL_ERROR + "A required library with LAPACK API not found. Please specify library location." + ) + else() + message(STATUS + "A library with LAPACK API not found. Please specify library location." 
+ ) + endif() + endif() + endif() +endif() + +cmake_pop_check_state() +set(CMAKE_FIND_LIBRARY_SUFFIXES ${_lapack_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}) diff --git a/cmake/FindNvToolExt.cmake b/cmake/FindNvToolExt.cmake new file mode 100644 index 00000000000..5f2998e442a --- /dev/null +++ b/cmake/FindNvToolExt.cmake @@ -0,0 +1,35 @@ +# The following variables are optionally searched for defaults +# NvToolExt_ROOT_DIR: +# +# The following are set after configuration is done: +# NvToolExt_FOUND +# NvToolExt_INCLUDE_DIR +# NvToolExt_LIBRARIES +# NvToolExt_LIBRARY_DIR +# NvToolExt: a target + +include(FindPackageHandleStandardArgs) + +set(NvToolExt_SEARCH_DIRS ${CUDA_TOOLKIT_ROOT_DIR}) +if(WIN32) + list(APPEND NvToolExt_SEARCH_DIRS "C:/Program Files/NVIDIA Corporation/NvToolsExt") +endif() +set(NvToolExt_SEARCH_DIRS ${NvToolExt_ROOT_DIR} ${NvToolExt_SEARCH_DIRS}) + + +find_path(NvToolExt_INCLUDE_DIR nvToolsExt.h HINTS ${NvToolExt_SEARCH_DIRS} PATH_SUFFIXES include) + +# 32bit not considered +set(NvToolExt_LIBNAME nvToolsExt libnvToolsExt.so libnvToolsExt.a libnvToolsExt.so nvToolsExt64_1.lib) +find_library(NvToolExt_LIBRARIES NAMES ${NvToolExt_LIBNAME} HINTS ${NvToolExt_SEARCH_DIRS} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) + +find_package_handle_standard_args(NvToolExt REQUIRED_VARS NvToolExt_INCLUDE_DIR NvToolExt_LIBRARIES) + +add_library(NvToolExt INTERFACE) +target_include_directories(NvToolExt INTERFACE ${NvToolExt_INCLUDE_DIR}) +# target_link_directories(NvToolExt INTERFACE ${NvToolExt_INCLUDE_DIR}) +target_link_libraries(NvToolExt INTERFACE ${NvToolExt_LIBRARIES}) + +unset(NvToolExt_SEARCH_DIRS) +unset(NvToolExt_LIBNAME) diff --git a/cmake/INSTALL.md b/cmake/INSTALL.md new file mode 100644 index 00000000000..0082212eb9b --- /dev/null +++ b/cmake/INSTALL.md @@ -0,0 +1,49 @@ +# Install Instruction + +Execute following commands in the repo root. + +## Build with Old Style Make Generator +```bash +mkdir -p build && cd build +cmake -DCMAKE_INSTALL_PREFIX=../dist .. # configure +cmake --build . --target install -- -j8 # build && install, substitude -j8 with /m:8 if you are on Windows +``` + +## Build with Ninja Generator +``` bash +mkdir -p build && cd build +cmake -GNinja -DCMAKE_INSTALL_PREFIX=../dist .. +cmake --build . --target install +``` + +After built, you can find all installed files in /dist + +# For Advance Configuration + +Follow options are currently available: + +| Variable | Available Options | Default | +| ---------------------- | ------------------------- | -------- | +| MATHLIB | OpenBLAS, MKL, Accelerate | OpenBLAS | +| KALDI_BUILD_EXE | ON,OFF | ON | +| KALDI_BUILD_TEST | ON,OFF | ON | +| KALDI_USE_PATCH_NUMBER | ON,OFF | OFF | +| BUILD_SHARED_LIBS | ON,OFF | OFF | + +Append `-D=` to the configure command to use it, e.g., +`-DKALDI_BUILD_TEST=OFF` will disable building of test executables. For more +information, please refers to +[CMake Documentation](https://cmake.org/cmake/help/latest/manual/cmake.1.html). +For quick learning CMake usage, LLVM's short introuction will do the trick: +[Basic CMake usage](https://llvm.org/docs/CMake.html#usage), +[Options and variables](https://llvm.org/docs/CMake.html#options-and-variables), +[Frequently-used CMake variables](https://llvm.org/docs/CMake.html#frequently-used-cmake-variables). + +NOTE 1: Currently, BUILD_SHARED_LIBS does not work on Windows due to some symbols + (variables) are not properly exported. 
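+
+For example, on Linux or macOS a configure step that selects MKL, builds shared
+libraries and skips the test executables could look like the following (a sketch
+combining the options listed above; adjust the install prefix to your setup):
+
+```bash
+cmake -DCMAKE_INSTALL_PREFIX=../dist \
+      -DMATHLIB=MKL \
+      -DBUILD_SHARED_LIBS=ON \
+      -DKALDI_BUILD_TEST=OFF ..
+```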
+ +NOTE 2: For scripts users, since you are doing an out of source build, and the + install destination is at your disposal, the `$PATH` is not configured + properly in this case. Scripts will not work out of box. See how `$PATH` + is modified in [path.sh](../egs/wsj/s5/path.sh). You should add + `/bin` to your `$PATH` before running any scripts. diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake new file mode 100644 index 00000000000..88dbefdacc9 --- /dev/null +++ b/cmake/Utils.cmake @@ -0,0 +1,46 @@ +if(NOT CMAKE_VERSION VERSION_LESS "3.10") + include_guard() +endif() + +# For Windows, some env or vars are using backward slash for pathes, convert +# them to forward slashes will fix some nasty problem in CMake. +macro(normalize_path in_path) + file(TO_CMAKE_PATH "${${in_path}}" normalize_path_out_path) + set(${in_path} "${normalize_path_out_path}") + unset(normalize_path_out_path) +endmacro() + +macro(normalize_env_path in_path) + file(TO_CMAKE_PATH "$${in_path}" normalize_env_path_out_path) + set(${in_path} "${normalize_env_path_out_path}") + unset(normalize_env_path_out_path) +endmacro() + + +macro(add_kaldi_executable) + if(${KALDI_BUILD_EXE}) + cmake_parse_arguments(kaldi_exe "" "NAME" "SOURCES;DEPENDS" ${ARGN}) + add_executable(${kaldi_exe_NAME} ${kaldi_exe_SOURCES}) + target_link_libraries(${kaldi_exe_NAME} PRIVATE ${kaldi_exe_DEPENDS}) + # list(APPEND KALDI_EXECUTABLES ${kaldi_exe_NAME}) + install(TARGETS ${kaldi_exe_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + + unset(kaldi_exe_NAME) + unset(kaldi_exe_SOURCES) + unset(kaldi_exe_DEPENDS) + endif() +endmacro() + +macro(add_kaldi_test_executable) + if(${KALDI_BUILD_TEST}) + cmake_parse_arguments(kaldi_test_exe "" "NAME" "SOURCES;DEPENDS" ${ARGN}) + add_executable(${kaldi_test_exe_NAME} ${kaldi_test_exe_SOURCES}) + target_link_libraries(${kaldi_test_exe_NAME} PRIVATE ${kaldi_test_exe_DEPENDS}) + # list(APPEND KALDI_TEST_EXECUTABLES ${kaldi_test_exe_NAME}) + install(TARGETS ${kaldi_test_exe_NAME} RUNTIME DESTINATION testbin) + + unset(kaldi_test_exe_NAME) + unset(kaldi_test_exe_SOURCES) + unset(kaldi_test_exe_DEPENDS) + endif() +endmacro() diff --git a/cmake/VersionHelper.cmake b/cmake/VersionHelper.cmake new file mode 100644 index 00000000000..e494a255663 --- /dev/null +++ b/cmake/VersionHelper.cmake @@ -0,0 +1,14 @@ +function(get_version) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/src/.version version) + string(STRIP ${version} version) + execute_process(COMMAND git log -n1 --format=%H src/.version + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE version_commit + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process(COMMAND git rev-list --count "${version_commit}..HEAD" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE patch_number) + + set(KALDI_VERSION ${version} PARENT_SCOPE) + set(KALDI_PATCH_NUMBER ${patch_number} PARENT_SCOPE) +endfunction() diff --git a/cmake/gen_cmake_skeleton.py b/cmake/gen_cmake_skeleton.py new file mode 100644 index 00000000000..fa506943662 --- /dev/null +++ b/cmake/gen_cmake_skeleton.py @@ -0,0 +1,310 @@ +import os +import sys +import re +import argparse + +# earily parse, will refernece args globally +parser = argparse.ArgumentParser() +parser.add_argument("working_dir") +parser.add_argument("--quiet", default=False, action="store_true") +args = parser.parse_args() + +def print_wrapper(*args_, **kwargs): + if not args.quiet: + print(*args_, **kwargs) + +def get_subdirectories(d): + return [name for name in os.listdir(d) if os.path.isdir(os.path.join(d, name))] + +def 
is_bin_dir(d): + return d.endswith("bin") + +def get_files(d): + return [name for name in os.listdir(d) if os.path.isfile(os.path.join(d, name))] + +def is_header(f): + return f.endswith(".h") + +def is_cu_source(f): + return f.endswith(".cu") + +def is_test_source(f): + return f.endswith("-test.cc") + +def is_source(f): + return f.endswith(".cc") and not is_test_source(f) + +def dir_name_to_lib_target(dir_name): + return "kaldi-" + dir_name + +def wrap_notwin32_condition(should_wrap, lines): + if isinstance(lines, str): + lines = [lines] + if should_wrap: + return ["if(NOT WIN32)"] + list(map(lambda l: " " + l, lines)) + ["endif()"] + else: + return lines + + +def get_exe_additional_depends(t): + additional = { + "transform-feats" : ["transform"], + "interpolate-pitch" : ["transform"], + "post-to-feats" : ["hmm"], + "append-post-to-feats" : ["hmm"], + "gmm-est-fmllr-gpost": ["sgmm2", "hmm"], + "gmm-est-fmllr": ["hmm", "transform"], + "gmm-latgen-faster": ["decoder"], + "gmm-transform-means": ["hmm"], + "gmm-post-to-gpost": ["hmm"], + "gmm-init-lvtln": ["transform"], + "gmm-rescore-lattice": ["hmm", "lat"], + "gmm-est-fmllr-global": ["transform"], + "gmm-copy": ["hmm"], + "gmm-train-lvtln-special": ["transform", "hmm"], + "gmm-est-map": ["hmm"], + "gmm-acc-stats2": ["hmm"], + "gmm-decode-faster-regtree-mllr": ["decoder"], + "gmm-global-est-fmllr": ["transform"], + "gmm-est-basis-fmllr": ["hmm", "transform"], + "gmm-init-model": ["hmm"], + "gmm-est-weights-ebw": ["hmm"], + "gmm-init-biphone": ["hmm"], + "gmm-compute-likes": ["hmm"], + "gmm-est-fmllr-raw-gpost": ["hmm", "transform"], + # gmm-* is a bottom case, it will add link dependencies to all other + # target whose names start with gmm-, it is harmless, but will increase + # link time. Better to avoid it at best. 
+ "gmm-*": ["hmm", "transform", "lat", "decoder"], + } + if t in additional: + return list(map(lambda name: dir_name_to_lib_target(name), additional[t])) + elif (t.split("-", 1)[0] + "-*") in additional: + wildcard = (t.split("-", 1)[0] + "-*") + return list(map(lambda name: dir_name_to_lib_target(name), additional[wildcard])) + else: + return [] + +def disable_for_win32(t): + disabled = [ + "online-audio-client", + "online-net-client", + "online2-tcp-nnet3-decode-faster", + "online-server-gmm-decode-faster", + "online-audio-server-decode-faster" + ] + return t in disabled + +class CMakeListsHeaderLibrary(object): + def __init__(self, dir_name): + self.dir_name = dir_name + self.target_name = dir_name_to_lib_target(self.dir_name) + self.header_list = [] + + def add_header(self, filename): + self.header_list.append(filename) + + def add_source(self, filename): + pass + + def add_cuda_source(self, filename): + pass + + def add_test_source(self, filename): + pass + + def gen_code(self): + ret = [] + if len(self.header_list) > 0: + ret.append("set(PUBLIC_HEADERS") + for f in self.header_list: + ret.append(" " + f) + ret.append(")\n") + + ret.append("add_library(" + self.target_name + " INTERFACE)") + ret.append("target_include_directories(" + self.target_name + " INTERFACE ") + ret.append(" $") + ret.append(" $") + ret.append(")\n") + + ret.append(""" +install(TARGETS {tgt} EXPORT kaldi-targets) + +install(FILES ${{PUBLIC_HEADERS}} DESTINATION include/kaldi/{dir}) +""".format(tgt=self.target_name, dir=self.dir_name)) + + return "\n".join(ret) + +class CMakeListsLibrary(object): + + def __init__(self, dir_name): + self.dir_name = dir_name + self.target_name = dir_name_to_lib_target(self.dir_name) + self.header_list = [] + self.source_list = [] + self.cuda_source_list = [] + self.test_source_list = [] + self.depends = [] + + def add_header(self, filename): + self.header_list.append(filename) + + def add_source(self, filename): + self.source_list.append(filename) + + def add_cuda_source(self, filename): + self.cuda_source_list.append(filename) + + def add_test_source(self, filename): + self.test_source_list.append(filename) + + def load_dependency_from_makefile(self, filename): + with open(filename) as f: + makefile = f.read() + if "ADDLIBS" not in makefile: + print_wrapper("WARNING: non-standard", filename) + return + libs = makefile.split("ADDLIBS")[-1].split("\n\n")[0] + libs = re.findall("[^\s\\\\=]+", libs) + for l in libs: + self.depends.append(os.path.splitext(os.path.basename(l))[0]) + + def gen_code(self): + ret = [] + + if len(self.header_list) > 0: + ret.append("set(PUBLIC_HEADERS") + for f in self.header_list: + ret.append(" " + f) + ret.append(")\n") + + if len(self.cuda_source_list) > 0: + self.source_list.append("${CUDA_OBJS}") + ret.append("cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)") + ret.append("cuda_compile(CUDA_OBJS") + for f in self.cuda_source_list: + ret.append(" " + f) + ret.append(")\n") + + ret.append("add_library(" + self.target_name) + for f in self.source_list: + ret.append(" " + f) + ret.append(")\n") + ret.append("target_include_directories(" + self.target_name + " PUBLIC ") + ret.append(" $") + ret.append(" $") + ret.append(")\n") + + if len(self.depends) > 0: + ret.append("target_link_libraries(" + self.target_name + " PUBLIC") + for d in self.depends: + ret.append(" " + d) + ret.append(")\n") + + def get_test_exe_name(filename): + exe_name = os.path.splitext(f)[0] + if self.dir_name.startswith("nnet") and exe_name.startswith("nnet"): + return 
self.dir_name + "-" + exe_name.split("-", 1)[1] + else: + return exe_name + + if len(self.test_source_list) > 0: + ret.append("if(KALDI_BUILD_TEST)") + for f in self.test_source_list: + exe_target = get_test_exe_name(f) + depends = (self.target_name + " " + " ".join(get_exe_additional_depends(exe_target))).strip() + ret.extend(wrap_notwin32_condition(disable_for_win32(self.target_name), + " add_kaldi_test_executable(NAME " + exe_target + " SOURCES " + f + " DEPENDS " + depends + ")")) + ret.append("endif()") + + ret.append(""" +install(TARGETS {tgt} + EXPORT kaldi-targets + ARCHIVE DESTINATION ${{CMAKE_INSTALL_LIBDIR}} + LIBRARY DESTINATION ${{CMAKE_INSTALL_LIBDIR}} + RUNTIME DESTINATION ${{CMAKE_INSTALL_BINDIR}} +) + +install(FILES ${{PUBLIC_HEADERS}} DESTINATION include/kaldi/{dir}) +""".format(tgt=self.target_name, dir=self.dir_name)) + + return "\n".join(ret) + + + +class CMakeListsExecutable(object): + + def __init__(self, dir_name, filename): + assert(dir_name.endswith("bin")) + self.list = [] + exe_name = os.path.splitext(os.path.basename(filename))[0] + file_name = filename + depend = dir_name_to_lib_target(dir_name[:-3]) + self.list.append((exe_name, file_name, depend)) + + def gen_code(self): + ret = [] + for exe_name, file_name, depend in self.list: + depends = (depend + " " + " ".join(get_exe_additional_depends(exe_name))).strip() + ret.extend(wrap_notwin32_condition(disable_for_win32(exe_name), + "add_kaldi_executable(NAME " + exe_name + " SOURCES " + file_name + " DEPENDS " + depends + ")")) + + return "\n".join(ret) + +class CMakeListsFile(object): + + GEN_CMAKE_HEADER = "# generated with cmake/gen_cmake_skeleton.py, DO NOT MODIFY.\n" + + def __init__(self, directory): + self.path = os.path.realpath(os.path.join(directory, "CMakeLists.txt")) + self.sections = [] + + def add_section(self, section): + self.sections.append(section) + + def write_file(self): + with open(self.path, "w", newline='\n') as f: # good luck for python2 + f.write(CMakeListsFile.GEN_CMAKE_HEADER) + for s in self.sections: + code = s.gen_code() + f.write(code) + f.write("\n") + print_wrapper(" Writed", self.path) + + +if __name__ == "__main__": + os.chdir(args.working_dir) + print_wrapper("Working in ", args.working_dir) + + subdirs = get_subdirectories(".") + for d in subdirs: + cmakelists = CMakeListsFile(d) + if is_bin_dir(d): + for f in get_files(d): + if is_source(f): + dir_name = os.path.basename(d) + filename = os.path.basename(f) + exe = CMakeListsExecutable(dir_name, filename) + cmakelists.add_section(exe) + else: + dir_name = os.path.basename(d) + lib = None + makefile = os.path.join(d, "Makefile") + if not os.path.exists(makefile): + lib = CMakeListsHeaderLibrary(dir_name) + else: + lib = CMakeListsLibrary(dir_name) + lib.load_dependency_from_makefile(makefile) + cmakelists.add_section(lib) + for f in sorted(get_files(d)): + filename = os.path.basename(f) + if is_source(filename): + lib.add_source(filename) + elif is_cu_source(filename): + lib.add_cuda_source(filename) + elif is_test_source(filename): + lib.add_test_source(filename) + elif is_header(filename): + lib.add_header(filename) + + cmakelists.write_file() diff --git a/cmake/kaldi-config.cmake.in b/cmake/kaldi-config.cmake.in new file mode 100644 index 00000000000..123f58c5699 --- /dev/null +++ b/cmake/kaldi-config.cmake.in @@ -0,0 +1,7 @@ +@PACKAGE_INIT@ + +find_package(Threads) + +if(NOT TARGET kaldi-base) + include(${CMAKE_CURRENT_LIST_DIR}/kaldi-targets.cmake) +endif() diff --git a/cmake/third_party/get_third_party.cmake 
b/cmake/third_party/get_third_party.cmake new file mode 100644 index 00000000000..8e24dc9f643 --- /dev/null +++ b/cmake/third_party/get_third_party.cmake @@ -0,0 +1,20 @@ +# Download and unpack a third-party library at configure time +# The original code is at the README of google-test: +# https://github.com/google/googletest/tree/master/googletest +function(get_third_party name) + configure_file( + "${PROJECT_SOURCE_DIR}/cmake/third_party/${name}.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${name}-download/CMakeLists.txt") + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${name}-download") + if(result) + message(FATAL_ERROR "CMake step for ${name} failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${name}-download") + if(result) + message(FATAL_ERROR "Build step for ${name} failed: ${result}") + endif() +endfunction() diff --git a/cmake/third_party/openfst.cmake b/cmake/third_party/openfst.cmake new file mode 100644 index 00000000000..19a7f527f8f --- /dev/null +++ b/cmake/third_party/openfst.cmake @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 2.8.2) +project(openfst-download NONE) + +include(ExternalProject) +ExternalProject_Add(openfst + GIT_REPOSITORY https://github.com/kkm000/openfst + GIT_TAG 0bca6e76d24647427356dc242b0adbf3b5f1a8d9 # tag win/1.7.2.1 + SOURCE_DIR "${CMAKE_BINARY_DIR}/openfst" + BINARY_DIR "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/cmake/third_party/openfst_lib_target.cmake b/cmake/third_party/openfst_lib_target.cmake new file mode 100644 index 00000000000..dde5efc402a --- /dev/null +++ b/cmake/third_party/openfst_lib_target.cmake @@ -0,0 +1,31 @@ +if(NOT OPENFST_ROOT_DIR) + message(FATAL_ERROR) +endif() + +set(fst_source_dir ${OPENFST_ROOT_DIR}/src/lib) +set(fst_include_dir ${OPENFST_ROOT_DIR}/src/include) + +include_directories(${fst_include_dir}) +file(GLOB fst_sources "${fst_source_dir}/*.cc") + +add_library(fst ${fst_sources}) +target_include_directories(fst PUBLIC + $ + $ +) + +install(TARGETS fst + EXPORT kaldi-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) + +install(DIRECTORY ${fst_include_dir}/fst + DESTINATION include/openfst + PATTERN "test/*.h" EXCLUDE +) + +unset(fst_source_dir) +unset(fst_include_dir) +unset(fst_sources) diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000000..852e9531bd6 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,30 @@ +# Kaldi Docker images + +Kaldi offers two set of images: CPU-based images and GPU-based images. Daily builds of the latest version of the master branch (both CPU and GPU images) are pushed daily to [DockerHub](https://hub.docker.com/r/kaldiasr/kaldi). + +## Using pre-built images +Sample usage of the CPU based images: +```bash +docker run -it kaldiasr/kaldi:latest bash +``` + +Sample usage of the GPU based images: + +Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images. + +```bash +docker run -it --runtime=nvidia kaldiasr/kaldi:gpu-latest bash +``` + +## Building images locally +For building the CPU-based image: +```bash +cd docker/debian9.8-cpu +docker build --tag kaldiasr/kaldi:latest . 
+``` + +and for GPU-based image: +```bash +cd docker/ubuntu16.04-gpu +docker build --tag kaldiasr/kaldi:gpu-latest . +``` diff --git a/docker/debian9.8-cpu/Dockerfile b/docker/debian9.8-cpu/Dockerfile new file mode 100644 index 00000000000..db0b9c47a73 --- /dev/null +++ b/docker/debian9.8-cpu/Dockerfile @@ -0,0 +1,41 @@ + +FROM debian:9.8 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + ca-certificates \ + gfortran \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared && \ + make depend -j $(nproc) && \ + make -j $(nproc) + +WORKDIR /opt/kaldi/ + diff --git a/docker/ubuntu16.04-gpu/Dockerfile b/docker/ubuntu16.04-gpu/Dockerfile new file mode 100644 index 00000000000..d705a5c1689 --- /dev/null +++ b/docker/ubuntu16.04-gpu/Dockerfile @@ -0,0 +1,41 @@ + +FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 +LABEL maintainer="mdoulaty@gmail.com" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + g++ \ + make \ + automake \ + autoconf \ + bzip2 \ + unzip \ + wget \ + sox \ + libtool \ + git \ + subversion \ + python2.7 \ + python3 \ + zlib1g-dev \ + gfortran \ + ca-certificates \ + patch \ + ffmpeg \ + vim && \ + rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python2.7 /usr/bin/python + +RUN git clone --depth 1 https://github.com/kaldi-asr/kaldi.git /opt/kaldi && \ + cd /opt/kaldi && \ + cd /opt/kaldi/tools && \ + ./extras/install_mkl.sh && \ + make -j $(nproc) && \ + cd /opt/kaldi/src && \ + ./configure --shared --use-cuda && \ + make depend -j $(nproc) && \ + make -j $(nproc) + +WORKDIR /opt/kaldi/ + diff --git a/egs/aidatatang_200zh/README.md b/egs/aidatatang_200zh/README.md new file mode 100644 index 00000000000..097454d84ce --- /dev/null +++ b/egs/aidatatang_200zh/README.md @@ -0,0 +1,21 @@ +Aidatatang_200zh is a free Chinese Mandarin speech corpus provided by Beijing DataTang Technology Co., Ltd under Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License. + +**About the aidatatang_200zh corpus:** + +- The corpus contains 200 hours of acoustic data, which is mostly mobile recorded data. +- 600 speakers from different accent areas in China are invited to participate in the recording. +- The transcription accuracy for each sentence is larger than 98%. +- Recordings are conducted in a quiet indoor environment. +- The database is divided into training set, validation set, and testing set in a ratio of 7: 1: 2. +- Detail information such as speech data coding and speaker information is preserved in the metadata file. +- Segmented transcripts are also provided. + +You can get the corpus from [here](https://www.datatang.com/webfront/opensource.html). + +DataTang is a community of creators-of world-changers and future-builders. We're invested in collaborating with a diverse set of voices in the AI world, and are excited about working on large-scale projects. Beyond speech, we're providing multiple resources in image, and text. For more details, please visit [datatang](). 
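+
+If you prefer to fetch the corpus from the command line, the recipe also ships a
+download helper. Run from the `s5` directory, something like the following should
+download and unpack the data from OpenSLR (a sketch based on the helper's own usage
+example; point the first argument at an existing directory with enough free space):
+
+```bash
+local/download_and_untar.sh /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh
+```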
+ +**About the recipe:** + +To demonstrate that this corpus is a reasonable data resource for Chinese Mandarin speech recognition research, a baseline recipe is provided here for everyone to explore their own systems easily and quickly. + +In this directory, each subdirectory contains the scripts for a sequence of experiments. The recipe in subdirectory "s5" is based on the hkust s5 recipe and aishell s5 recipe. It generates an integrated phonetic lexicon with CMU dictionary and cedit dictionary. This recipe follows the Mono+Triphone+SAT+fMLLR+DNN pipeline. In addition, this directory will be extended as scripts for speaker diarization and so on are created. diff --git a/egs/aidatatang_200zh/s5/RESULTS b/egs/aidatatang_200zh/s5/RESULTS new file mode 100644 index 00000000000..8c458e8015e --- /dev/null +++ b/egs/aidatatang_200zh/s5/RESULTS @@ -0,0 +1,17 @@ +%WER 37.09 [ 173936 / 468933, 4868 ins, 31143 del, 137925 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 17.98 [ 84305 / 468933, 4724 ins, 12637 del, 66944 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 17.94 [ 84149 / 468933, 5025 ins, 12427 del, 66697 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 17.26 [ 80945 / 468933, 4421 ins, 12958 del, 63566 sub ] exp/tri3a/decode_test/cer_14_0.0 +%WER 14.16 [ 66424 / 468933, 4567 ins, 10224 del, 51633 sub ] exp/tri4a/decode_test/cer_14_0.0 +%WER 12.22 [ 57304 / 468933, 4799 ins, 8197 del, 44308 sub ] exp/tri5a/decode_test/cer_14_0.0 +%WER 5.59 [ 26232 / 468933, 1701 ins, 4377 del, 20154 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_0.0 + +# nnet3 tdnn with online pitch, local/nnet3/tuning/run_tdnn_2a.sh +%WER 7.21 [ 33797 / 468933, 2141 ins, 6117 del, 25539 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_0.0 +%WER 7.44 [ 34878 / 468933, 2252 ins, 5854 del, 26772 sub ] exp/nnet3/tdnn_sp_online/decode_test/cer_12_0.0 +%WER 7.79 [ 36542 / 468933, 2527 ins, 5674 del, 28341 sub ] exp/nnet3/tdnn_sp_online/decode_test_per_utt/cer_12_0.0 + +# chain with online pitch, local/chain/tuning/run_tdnn_2a.sh +%WER 5.61 [ 26311 / 468933, 1773 ins, 4789 del, 19749 sub ] exp/chain/tdnn_2a_sp/decode_test/cer_11_0.0 +%WER 5.69 [ 26661 / 468933, 1723 ins, 4724 del, 20214 sub ] exp/chain/tdnn_2a_sp_online/decode_test/cer_11_0.0 +%WER 5.98 [ 28046 / 468933, 2031 ins, 4527 del, 21488 sub ] exp/chain/tdnn_2a_sp_online/decode_test_per_utt/cer_11_0.0 diff --git a/egs/aidatatang_200zh/s5/cmd.sh b/egs/aidatatang_200zh/s5/cmd.sh new file mode 100644 index 00000000000..811adcde474 --- /dev/null +++ b/egs/aidatatang_200zh/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
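+#
+# For example, to run everything locally with run.pl instead of a grid engine,
+# you could use the following (a sketch; only advisable on a machine with
+# plenty of memory, as warned above):
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl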
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/aidatatang_200zh/s5/conf/cmu2pinyin b/egs/aidatatang_200zh/s5/conf/cmu2pinyin new file mode 100644 index 00000000000..c02eb600fcc --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/cmu2pinyin @@ -0,0 +1,39 @@ +AA A +AE A +AH A +AO UO +AW U +AY AI +B B +CH CH +D D +DH S I +EH AI +ER E +EY AI +F F +G G +HH H +IH I +IY I +JH ZH +K K +L L +M M +N N +NG N +OW UO +OY UO +P P +R R +S S +SH SH +T T +TH S +UH U +UW U +V W +W W +Y Y +Z Z +ZH X diff --git a/egs/aidatatang_200zh/s5/conf/decode.config b/egs/aidatatang_200zh/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/aidatatang_200zh/s5/conf/mfcc.conf b/egs/aidatatang_200zh/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf b/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 8000 (=3800) diff --git a/egs/aidatatang_200zh/s5/conf/online_cmvn.conf b/egs/aidatatang_200zh/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. 
diff --git a/egs/aidatatang_200zh/s5/conf/online_pitch.conf b/egs/aidatatang_200zh/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/aidatatang_200zh/s5/conf/pinyin2cmu b/egs/aidatatang_200zh/s5/conf/pinyin2cmu new file mode 100644 index 00000000000..a6e53620479 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pinyin2cmu @@ -0,0 +1,58 @@ +A AA +AI AY +AN AE N +ANG AE NG +AO AW +B B +CH CH +C T S +D D +E ER +EI EY +EN AH N +ENG AH NG +ER AA R +F F +G G +H HH +IA IY AA +IANG IY AE NG +IAN IY AE N +IAO IY AW +IE IY EH +I IY +ING IY NG +IN IY N +IONG IY UH NG +IU IY UH +J J +K K +L L +M M +N N +O AO +ONG UH NG +OU OW +P P +Q Q +R R +SH SH +S S +T T +UAI UW AY +UANG UW AE NG +UAN UW AE N +UA UW AA +UI UW IY +UN UW AH N +UO UW AO +U UW +UE IY EH +VE IY EH +V IY UW +VN IY N +W W +X X +Y Y +ZH JH +Z Z diff --git a/egs/aidatatang_200zh/s5/conf/pinyin_initial b/egs/aidatatang_200zh/s5/conf/pinyin_initial new file mode 100644 index 00000000000..e263ad07e2a --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pinyin_initial @@ -0,0 +1,23 @@ +B +C +CH +D +F +G +H +J +K +L +M +N +P +Q +R +S +SH +T +W +X +Y +Z +ZH diff --git a/egs/aidatatang_200zh/s5/conf/pitch.conf b/egs/aidatatang_200zh/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/aidatatang_200zh/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh b/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..71e6fbe106d --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/compare_wer.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. models in aidatatang_200zh chain directory +# exemplar usage: local/chain/compare_wer.sh --online exp/chain/tdnn_2a_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 --online exp/chain/tdnn_2a_sp" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? 
+if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_test_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print final log prob for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +# do the same for xent objective +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh b/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100644 index 00000000000..0be0e2c79c6 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,193 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in swbd chain recipe. + +# results +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp/ +# Model tdnn_1a_sp +# WER(%) 5.59 +# Final train prob -0.0488 +# Final valid prob -0.0925 +# Final train prob (xent) -0.8001 +# Final valid prob (xent) -1.0398 + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +exit; diff --git a/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100644 index 00000000000..78dd4000e58 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. 
+ +# results +# local/chain/compare_wer.sh exp/chain/tdnn_2a_sp +# Model tdnn_2a_sp +# WER(%) 5.61 +# Final train prob -0.0502 +# Final valid prob -0.0913 +# Final train prob (xent) -0.8047 +# Final valid prob (xent) -1.0292 + +# local/chain/compare_wer.sh --online exp/chain/tdnn_2a_sp +# Model tdnn_2a_sp +# WER(%) 5.61 +# WER(%)[online] 5.69 +# WER(%)[per-utt] 5.98 +# Final train prob -0.0502 +# Final valid prob -0.0913 +# Final train prob (xent) -0.8047 +# Final valid prob (xent) -1.0292 + +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp exp/chain/tdnn_2a_sp +# Model tdnn_1a_sp tdnn_2a_sp +# WER(%) 5.59 5.61 +# Final train prob -0.0488 -0.0502 +# Final valid prob -0.0925 -0.0913 +# Final train prob (xent) -0.8001 -0.8047 +# Final valid prob (xent) -1.0398 -1.0292 + +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl b/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl new file mode 100644 index 00000000000..33e2e8061c3 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/create_oov_char_lexicon.pl @@ -0,0 +1,48 @@ +#!/usr/bin/env perl +# Copyright 2016 Alibaba Robotics Corp. (Author: Xingyu Na) +# +# A script for char-based Chinese OOV lexicon generation. +# +# Input 1: char-based dictionary, example +# CHAR1 ph1 ph2 +# CHAR2 ph3 +# CHAR3 ph2 ph4 +# +# Input 2: OOV word list, example +# WORD1 +# WORD2 +# WORD3 +# +# where WORD1 is in the format of "CHAR1CHAR2". +# +# Output: OOV lexicon, in the format of normal lexicon + +if($#ARGV != 1) { + print STDERR "usage: perl create_oov_char_lexicon.pl chardict oovwordlist > oovlex\n\n"; + print STDERR "### chardict: a dict in which each line contains the pronunciation of one Chinese char\n"; + print STDERR "### oovwordlist: OOV word list\n"; + print STDERR "### oovlex: output OOV lexicon\n"; + exit; +} + +use utf8; +my %prons; +open(DICT, $ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +binmode(DICT,":encoding(utf8)"); +foreach () { + chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; +} +close DICT; + +open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +binmode(WORDS,":encoding(utf8)"); +while () { + chomp; + print $_; + @A = split("", $_); + foreach (@A) { + print " $prons{$_}"; + } + print "\n"; +} +close WORDS; diff --git a/egs/aidatatang_200zh/s5/local/data_prep.sh b/egs/aidatatang_200zh/s5/local/data_prep.sh new file mode 100644 index 00000000000..bb278a7d904 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/data_prep.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Copyright 2017 Xingyu Na +# Apache 2.0 + +. ./path.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aidatatang_200zh/corpus /export/a05/xna/data/data_aidatatang_200zh/transcript" + exit 1; +fi + +aidatatang_audio_dir=$1 +aidatatang_text=$2/aidatatang_200_zh_transcript.txt + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test +tmp_dir=data/local/tmp + +mkdir -p $train_dir +mkdir -p $dev_dir +mkdir -p $test_dir +mkdir -p $tmp_dir + +# data directory check +if [ ! -d $aidatatang_audio_dir ] || [ ! 
-f $aidatatang_text ]; then + echo "Error: $0 requires two directory arguments" + exit 1; +fi + +# find wav audio file for train, dev and test resp. +find $aidatatang_audio_dir -iname "*.wav" > $tmp_dir/wav.flist +n=`cat $tmp_dir/wav.flist | wc -l` +[ $n -ne 237265 ] && \ + echo Warning: expected 237265 data files, found $n + +grep -i "corpus/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; +grep -i "corpus/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; +grep -i "corpus/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; + +rm -r $tmp_dir + +# Transcriptions preparation +for dir in $train_dir $dev_dir $test_dir; do + echo Preparing $dir transcriptions + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all + paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all + utils/filter_scp.pl -f 1 $dir/utt.list $aidatatang_text > $dir/transcripts.txt + awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk + utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp + sort -u $dir/transcripts.txt > $dir/text + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +done + +mkdir -p data/train data/dev data/test + +for f in spk2utt utt2spk wav.scp text; do + cp $train_dir/$f data/train/$f || exit 1; + cp $dev_dir/$f data/dev/$f || exit 1; + cp $test_dir/$f data/test/$f || exit 1; +done + +echo "$0: aidatatang_200zh data preparation succeeded" +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/download_and_untar.sh b/egs/aidatatang_200zh/s5/local/download_and_untar.sh new file mode 100644 index 00000000000..39f9ac01ff7 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/download_and_untar.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/62 aidatatang_200zh" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: aidatatang_200zh." +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="aidatatang_200zh" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="18756983399" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.gz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." 
+ exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +dev_dir=$data/$part/corpus/dev +test_dir=$data/$part/corpus/test +train_dir=$data/$part/corpus/train +if [ $part == "aidatatang_200zh" ]; then + for set in $dev_dir $test_dir $train_dir;do + cd $set + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/gale_format_data.sh b/egs/aidatatang_200zh/s5/local/format_data.sh old mode 100755 new mode 100644 similarity index 73% rename from egs/gale_arabic/s5b/local/gale_format_data.sh rename to egs/aidatatang_200zh/s5/local/format_data.sh index b69c34e68b9..47af9dd9dfd --- a/egs/gale_arabic/s5b/local/gale_format_data.sh +++ b/egs/aidatatang_200zh/s5/local/format_data.sh @@ -1,23 +1,25 @@ #!/bin/bash +# -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 +. ./path.sh -if [ -f path.sh ]; then - . ./path.sh; else - echo "$0: missing path.sh"; exit 1; -fi +silprob=0.5 +mkdir -p data/lang_test data/train data/dev -for dir in test train; do - cp -pr data/local/$dir data/$dir -done - - -mkdir -p data/lang_test arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; +# Copy stuff into its final locations... + +for f in spk2utt utt2spk wav.scp text; do + cp data/local/train/$f data/train/$f || exit 1; +done + +for f in spk2utt utt2spk wav.scp text; do + cp data/local/dev/$f data/dev/$f || exit 1; +done + rm -r data/lang_test cp -r data/lang data/lang_test @@ -26,15 +28,15 @@ gunzip -c "$arpa_lm" | \ --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst -echo "$0: Checking how stochastic G is (the first of these numbers should be small):" +echo "Checking how stochastic G is (the first of these numbers should be small):" fstisstochastic data/lang_test/G.fst ## Check lexicon. ## just have a look and make sure it seems sane. -echo "$0: First few lines of lexicon FST:" +echo "First few lines of lexicon FST:" fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head -echo "$0: Performing further checks" +echo Performing further checks # Checking that G.fst is determinizable. fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G. @@ -55,6 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic -echo gale_format_data succeeded. - -exit 0 +echo format_data succeeded. diff --git a/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh b/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..2d85626c356 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. 
models in aidatatang_200zh nnet3 directory +# exemplar usage: local/nnet3/compare_wer.sh exp/nnet3/tdnn_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_sp exp/nnet3/tdnn_sp_pr" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? +if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_test_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print log for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo diff --git a/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh b/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh new file mode 100644 index 00000000000..0fe55ecf000 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,160 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev test" +gmm=tri5a +online=false +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/train_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires$online_affix + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/aidatatang-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
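As a reference, a minimal sketch of how such a quarter subset is typically taken with the standard subset_data_dir.sh helper; the directory names below are illustrative rather than a verbatim copy of what this script uses:
```
# Sketch only: take roughly a quarter of the speed-perturbed utterances
# to train the diagonal UBM. Data-dir and destination names are illustrative.
temp_data_root=exp/nnet3/diag_ubm
num_utts_total=$(wc -l < data/train_sp_hires_nopitch/utt2spk)
num_utts=$((num_utts_total / 4))
utils/data/subset_data_dir.sh data/train_sp_hires_nopitch \
  $num_utts ${temp_data_root}/train_sp_hires_nopitch_subset
```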
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=$ivector_dim name=ivector + input dim=$feat_dim name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aidatatang-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
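Once the online decodes above and the per-utterance decodes below have finished, the results can be summarized side by side with the compare_wer.sh helper added earlier in this patch; a typical invocation, using the model directory from that script's own usage example:
```
# Prints one column per model dir: WER, WER[online], WER[per-utt],
# plus final train/valid probs when available.
local/nnet3/compare_wer.sh --online exp/nnet3/tdnn_sp
```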
+ for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_${decode_set}_per_utt + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config --per-utt true \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/prepare_dict.sh b/egs/aidatatang_200zh/s5/local/prepare_dict.sh new file mode 100644 index 00000000000..aa72bcd48d2 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/prepare_dict.sh @@ -0,0 +1,320 @@ +#!/bin/bash +#Copyright 2016 LeSpeech (Author: Xingyu Na) + +# prepare dictionary for aidatatang +# it is done for English and Chinese separately, +# For English, we use CMU dictionary, and Sequitur G2P +# for OOVs, while all englist phone set will concert to Chinese +# phone set at the end. For Chinese, we use an online dictionary, +# for OOV, we just produce pronunciation using Charactrt Mapping. + +. ./path.sh + +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test +dict_dir=data/local/dict +mkdir -p $dict_dir +mkdir -p $dict_dir/lexicon-{en,ch} + +# extract full vocabulary +cat $train_dir/text $dev_dir/text $test_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ + perl -ape 's/ /\n/g;' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt || exit 1; + +# split into English and Chinese +cat $dict_dir/words.txt | grep '[a-zA-Z]' > $dict_dir/lexicon-en/words-en.txt || exit 1; +cat $dict_dir/words.txt | grep -v '[a-zA-Z]' > $dict_dir/lexicon-ch/words-ch.txt || exit 1; + + +##### produce pronunciations for english +if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then + echo "--- Downloading CMU dictionary ..." + svn co -r 13068 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dict_dir/cmudict || exit 1; +fi + +# format cmudict +echo "--- Striping stress and pronunciation variant markers from cmudict ..." +perl $dict_dir/cmudict/scripts/make_baseform.pl \ + $dict_dir/cmudict/cmudict.0.7a /dev/stdout |\ + sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict/cmudict-plain.txt || exit 1; + +# extract in-vocab lexicon and oov words +echo "--- Searching for English OOV words ..." +awk 'NR==FNR{words[$1]; next;} !($1 in words)' \ + $dict_dir/cmudict/cmudict-plain.txt $dict_dir/lexicon-en/words-en.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/words-en-oov.txt || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/lexicon-en/words-en.txt $dict_dir/cmudict/cmudict-plain.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-en/lexicon-en-iv.txt || exit 1; + +wc -l $dict_dir/lexicon-en/words-en-oov.txt +wc -l $dict_dir/lexicon-en/lexicon-en-iv.txt + +# setup g2p and generate oov lexicon +if [ ! -f conf/g2p_model ]; then + echo "--- Downloading a pre-trained Sequitur G2P model ..." + wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model + if [ ! -f conf/g2p_model ]; then + echo "Failed to download the g2p model!" + exit 1 + fi +fi + +echo "--- Preparing pronunciations for OOV words ..." +g2p=`which g2p.py` +if [ ! -x $g2p ]; then + echo "g2p.py is not found. Checkout tools/extras/install_sequitur.sh." 
+ exit 1 +fi +g2p.py --model=conf/g2p_model --apply $dict_dir/lexicon-en/words-en-oov.txt \ + > $dict_dir/lexicon-en/lexicon-en-oov.txt || exit 1; + +# merge in-vocab and oov lexicon +cat $dict_dir/lexicon-en/lexicon-en-oov.txt $dict_dir/lexicon-en/lexicon-en-iv.txt |\ + sort > $dict_dir/lexicon-en/lexicon-en-phn.txt || exit 1; + +# convert cmu phoneme to pinyin phonenme +mkdir -p $dict_dir/map +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/map/cmu || exit 1; +cat conf/pinyin2cmu | awk -v cmu=$dict_dir/map/cmu \ + 'BEGIN{while((getline $dict_dir/map/cmu-used || exit 1; +cat $dict_dir/map/cmu | awk -v cmu=$dict_dir/map/cmu-used \ + 'BEGIN{while((getline $dict_dir/map/cmu-not-used || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/map/cmu-not-used conf/cmu2pinyin |\ + egrep -v '<.?s>' > $dict_dir/map/cmu-py || exit 1; + +cat $dict_dir/map/cmu-py | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } + print "@entry"; + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/map/cmu-cmu || exit 1; + +cat $dict_dir/lexicon-en/lexicon-en-phn.txt | \ + perl -e ' + open(MAPS, $ARGV[0]) or die("could not open map file"); + my %py2ph; + foreach $line () { + @A = split(" ", $line); + $py = shift(@A); + $py2ph{$py} = [@A]; + } + my @entry; + while () { + @A = split(" ", $_); + @entry = (); + $W = shift(@A); + push(@entry, $W); + for($i = 0; $i < @A; $i++) { + if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } + else {push(@entry, $A[$i])}; + } + print "@entry"; + print "\n"; + } +' $dict_dir/map/cmu-cmu > $dict_dir/lexicon-en/lexicon-en.txt || exit 1; + + +##### produce pronunciations for chinese +if [ ! -f $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt ]; then + echo "------------- Downloading cedit dictionary ---------------" + mkdir -p $dict_dir/cedict + wget -P $dict_dir/cedict http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + gunzip $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz +fi + +cat $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ + perl -e ' + while () { + @A = split(" ", $_); + print $A[1]; + for($n = 2; $n < @A; $n++) { + $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; + $tmp = uc($A[$n]); + print " $tmp"; + } + print "\n"; + } + ' | sort -k1 > $dict_dir/cedict/ch-dict.txt || exit 1; + +echo "--- Searching for Chinese OOV words ..." 
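The in-vocabulary/OOV split used just below for Chinese (and earlier for English) relies on awk's two-file NR==FNR idiom; a self-contained illustration with hypothetical file names:
```
# First pass (NR==FNR): remember every headword of lexicon.txt.
# Second pass: print only the words of wordlist.txt that were never seen, i.e. the OOVs.
awk 'NR==FNR {seen[$1]; next} !($1 in seen)' lexicon.txt wordlist.txt > words-oov.txt
```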
+awk 'NR==FNR{words[$1]; next;} !($1 in words)' \ + $dict_dir/cedict/ch-dict.txt $dict_dir/lexicon-ch/words-ch.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/words-ch-oov.txt || exit 1; + +awk 'NR==FNR{words[$1]; next;} ($1 in words)' \ + $dict_dir/lexicon-ch/words-ch.txt $dict_dir/cedict/ch-dict.txt |\ + egrep -v '<.?s>' > $dict_dir/lexicon-ch/lexicon-ch-iv.txt || exit 1; + +wc -l $dict_dir/lexicon-ch/words-ch-oov.txt +wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt + + +# validate Chinese dictionary and compose a char-based +# dictionary in order to get OOV pronunciations +cat $dict_dir/cedict/ch-dict.txt |\ + perl -e ' + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); + while () { + @A = split(" ", $_); + $word_len = length($A[0]); + $proun_len = @A - 1 ; + if ($word_len == $proun_len) {print $_;} + } + ' > $dict_dir/cedict/ch-dict-1.txt || exit 1; + +# extract chars +cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ + perl -e ' + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); + while () { + @A = split(" ", $_); + @chars = split("", $A[0]); + foreach (@chars) { + print "$_\n"; + } + } + ' | grep -v '^$' > $dict_dir/lexicon-ch/ch-char.txt || exit 1; + +# extract individual pinyins +cat $dict_dir/cedict/ch-dict-1.txt |\ + awk '{for(i=2; i<=NF; i++) print $i}' |\ + perl -ape 's/ /\n/g;' > $dict_dir/lexicon-ch/ch-char-pinyin.txt || exit 1; + +# first make sure number of characters and pinyins +# are equal, so that a char-based dictionary can +# be composed. +nchars=`wc -l < $dict_dir/lexicon-ch/ch-char.txt` +npinyin=`wc -l < $dict_dir/lexicon-ch/ch-char-pinyin.txt` +if [ $nchars -ne $npinyin ]; then + echo "Found $nchars chars and $npinyin pinyin. Please check!" + exit 1 +fi + +paste $dict_dir/lexicon-ch/ch-char.txt $dict_dir/lexicon-ch/ch-char-pinyin.txt |\ + sort -u > $dict_dir/lexicon-ch/ch-char-dict.txt || exit 1; + +# create a multiple pronunciation dictionary +cat $dict_dir/lexicon-ch/ch-char-dict.txt |\ + perl -e ' + my $prev = ""; + my $out_line = ""; + while () { + @A = split(" ", $_); + $cur = $A[0]; + $cur_py = $A[1]; + #print length($prev); + if (length($prev) == 0) { $out_line = $_; chomp($out_line);} + if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);} + if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} + $prev = $cur; + } + print $out_line; + ' > $dict_dir/lexicon-ch/ch-char-dict-mp.txt || exit 1; + +# get lexicon for Chinese OOV words +local/create_oov_char_lexicon.pl $dict_dir/lexicon-ch/ch-char-dict-mp.txt \ + $dict_dir/lexicon-ch/words-ch-oov.txt > $dict_dir/lexicon-ch/lexicon-ch-oov.txt || exit 1; + +# seperate multiple prons for Chinese OOV lexicon +cat $dict_dir/lexicon-ch/lexicon-ch-oov.txt |\ + perl -e ' + my @entry; + my @entry1; + while () { + @A = split(" ", $_); + @entry = (); + push(@entry, $A[0]); + for($i = 1; $i < @A; $i++ ) { + @py = split("/", $A[$i]); + @entry1 = @entry; + @entry = (); + for ($j = 0; $j < @entry1; $j++) { + for ($k = 0; $k < @py; $k++) { + $tmp = $entry1[$j]." 
".$py[$k]; + push(@entry, $tmp); + } + } + } + for ($i = 0; $i < @entry; $i++) { + print $entry[$i]; + print "\n"; + } + } + ' > $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt || exit 1; + +# compose IV and OOV lexicons for Chinese +cat $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt $dict_dir/lexicon-ch/lexicon-ch-iv.txt |\ + awk '{if (NF > 1 && $2 ~ /[A-Za-z0-9]+/) print $0;}' > $dict_dir/lexicon-ch/lexicon-ch.txt || exit 1; + +# convert Chinese pinyin to CMU format +cat $dict_dir/lexicon-ch/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ + utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch/lexicon-ch-cmu.txt || exit 1; + +# combine English and Chinese lexicons +cat $dict_dir/lexicon-en/lexicon-en.txt $dict_dir/lexicon-ch/lexicon-ch-cmu.txt |\ + sort -u > $dict_dir/lexicon1.txt || exit 1; + +cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ + sort -u |\ + perl -e ' + my %ph_cl; + while () { + $phone = $_; + chomp($phone); + chomp($_); + $phone =~ s:([A-Z]+)[0-9]:$1:; + if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } + else { $ph_cl{$phone} = [$_]; } + } + foreach $key ( keys %ph_cl ) { + print "@{ $ph_cl{$key} }\n" + } + ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1; + +( echo SIL; echo SPN; echo NSN; echo LAU ) > $dict_dir/silence_phones.txt + +echo SIL > $dict_dir/optional_silence.txt + +# No "extra questions" in the input to this setup, as we don't +# have stress or tone + +cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dict_dir/extra_questions.txt || exit 1; + +# Add to the lexicon the silences, noises etc. +(echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU'; + echo ' SPN' ) | \ + cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; + +echo "$0: aidatatang_200zh dict preparation succeeded" +exit 0; diff --git a/egs/aidatatang_200zh/s5/local/score.sh b/egs/aidatatang_200zh/s5/local/score.sh new file mode 100644 index 00000000000..a9786169973 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/aidatatang_200zh/s5/local/train_lms.sh b/egs/aidatatang_200zh/s5/local/train_lms.sh new file mode 100644 index 00000000000..bc52f8acb20 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/train_lms.sh @@ -0,0 +1,92 @@ +#!/bin/bash + + +# To be run from one directory above this script. + + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aidatatang_data_prep.sh. +# It takes as input the files +#data/local/train/text +#data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +export LC_ALL=C # You'll get errors about things being not sorted, if you + # have a different locale. +kaldi_lm=`which train_lm.sh` +if [ ! -x $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. 
That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). +# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0 + + +# From here is some commands to do a baseline with SRILM (assuming +# you have it installed). +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train + +cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist + + +ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 + +# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. +# Difference in WSJ must have been due to different treatment of . 
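For context, the ARPA LM written to data/local/lm/3gram-mincount/lm_unpruned.gz is compiled into G.fst by local/format_data.sh earlier in this patch; a minimal sketch of that conversion with standard Kaldi tools (treat the exact options as an assumption, since format_data.sh is only partially shown here):
```
# Compile the ARPA LM into G.fst over the lang_test word symbol table,
# then check that the result is close to stochastic.
gunzip -c data/local/lm/3gram-mincount/lm_unpruned.gz | \
  arpa2fst --disambig-symbol='#0' \
           --read-symbol-table=data/lang_test/words.txt - data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst   # the first number printed should be close to zero
```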
+ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 diff --git a/egs/aidatatang_200zh/s5/local/wer_hyp_filter b/egs/aidatatang_200zh/s5/local/wer_hyp_filter new file mode 100644 index 00000000000..a1bfdb57efc --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('[NOISE]','[LAUGHTER]','[VOCALIZED-NOISE]','','%HESITATION'); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aidatatang_200zh/s5/local/wer_output_filter b/egs/aidatatang_200zh/s5/local/wer_output_filter new file mode 100644 index 00000000000..aceeeec41b4 --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/aidatatang_200zh/s5/local/wer_ref_filter b/egs/aidatatang_200zh/s5/local/wer_ref_filter new file mode 100644 index 00000000000..a1bfdb57efc --- /dev/null +++ b/egs/aidatatang_200zh/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('[NOISE]','[LAUGHTER]','[VOCALIZED-NOISE]','','%HESITATION'); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aidatatang_200zh/s5/path.sh b/egs/aidatatang_200zh/s5/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/aidatatang_200zh/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/aidatatang_200zh/s5/run.sh b/egs/aidatatang_200zh/s5/run.sh new file mode 100644 index 00000000000..47e46a660cd --- /dev/null +++ b/egs/aidatatang_200zh/s5/run.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2019 Beijing DataTang Tech. Co. Ltd. (Author: Liyuan Wang) +# 2017 Hui Bu +# 2017 Jiayu Du +# 2017 Xingyu Na +# 2017 Bengu Wu +# 2017 Hao Zheng +# Apache 2.0 + +# This is a shell script, but it's recommended that you run the commands one by +# one by copying and pasting into the shell. +# Caution: some of the graph creation steps use quite a bit of memory, so you +# should run this on a machine that has sufficient memory. + + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. 
./path.sh + + +# corpus directory and download URL +data=/export/a05/xna/data +data_url=www.openslr.org/resources/62 + +# Obtain the database +#[ -d $data ] || mkdir -p $data || exit 1; +local/download_and_untar.sh $data $data_url aidatatang_200zh || exit 1; + +# Data Preparation: generate text, wav.scp, utt2spk, spk2utt +local/data_prep.sh $data/aidatatang_200zh/corpus $data/aidatatang_200zh/transcript || exit 1; + +# Lexicon Preparation: build a large lexicon that invovles words in both the training and decoding +local/prepare_dict.sh || exit 1; + +# Prepare Language Stuff +# Phone Sets, questions, L compilation +utils/prepare_lang.sh --position-dependent-phones false data/local/dict "" data/local/lang data/lang || exit 1; + +# LM training +local/train_lms.sh || exit 1; + +# G compilation, check LG composition +local/format_data.sh + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you want to store MFCC features. +mfccdir=mfcc +for x in train dev test; do + steps/make_mfcc_pitch.sh --write_utt2dur false --write_utt2num_frames false --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; +done + +steps/train_mono.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/mono || exit 1; + +# Monophone decoding +utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/mono/graph data/dev exp/mono/decode_dev + +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/mono/graph data/test exp/mono/decode_test + +# Get alignments from monophone system. +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + +# train tri1 [first triphone pass] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + +# decode tri1 +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri1/graph data/dev exp/tri1/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri1/graph data/test exp/tri1/decode_test + +# align tri1 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + +# train tri2 [delta+delta-deltas] +steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + +# decode tri2 +utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri2/graph data/dev exp/tri2/decode_dev +steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \ + exp/tri2/graph data/test exp/tri2/decode_test + +#align tri2 +steps/align_si.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + +# Train tri3a, which is LDA+MLLT, +steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; +steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev +steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + +# From now, we start 
building a more serious system (with SAT), and we'll +# do the alignment with fMLLR. +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + +steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri4a exp/tri4a_ali + +# Building a larger SAT system. + +steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + +utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev || exit 1; +steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + +steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + +# nnet3 +local/nnet3/run_tdnn.sh + +# chain +local/chain/run_tdnn.sh + +# getting results (see RESULTS file) +for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + +exit 0; diff --git a/egs/aidatatang_200zh/s5/steps b/egs/aidatatang_200zh/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/aidatatang_200zh/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/aidatatang_200zh/s5/utils b/egs/aidatatang_200zh/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/aidatatang_200zh/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/aishell/s5/RESULTS b/egs/aishell/s5/RESULTS index b58ede148c4..b6155cb62d4 100644 --- a/egs/aishell/s5/RESULTS +++ b/egs/aishell/s5/RESULTS @@ -1,8 +1,18 @@ -%WER 33.82 [ 35432 / 104765, 743 ins, 3991 del, 30698 sub ] exp/mono/decode_test/cer_12_0.0 -%WER 19.39 [ 20310 / 104765, 903 ins, 1452 del, 17955 sub ] exp/tri1/decode_test/cer_13_0.5 -%WER 19.23 [ 20147 / 104765, 910 ins, 1287 del, 17950 sub ] exp/tri2/decode_test/cer_14_0.5 -%WER 17.14 [ 17961 / 104765, 812 ins, 1024 del, 16125 sub ] exp/tri3a/decode_test/cer_14_0.0 -%WER 13.64 [ 14294 / 104765, 669 ins, 736 del, 12889 sub ] exp/tri4a/decode_test/cer_14_0.5 -%WER 12.23 [ 12809 / 104765, 656 ins, 580 del, 11573 sub ] exp/tri5a/decode_test/cer_13_1.0 -%WER 8.45 [ 8849 / 104765, 312 ins, 538 del, 7999 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_1.0 -%WER 7.46 [ 7813 / 104765, 287 ins, 472 del, 7054 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0 +%WER 36.41 [ 38146 / 104765, 837 ins, 3114 del, 34195 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 18.76 [ 19654 / 104765, 949 ins, 1152 del, 17553 sub ] exp/tri1/decode_test/cer_13_0.5 +%WER 18.64 [ 19531 / 104765, 941 ins, 1159 del, 17431 sub ] exp/tri2/decode_test/cer_14_0.5 +%WER 17.04 [ 17849 / 104765, 810 ins, 1021 del, 16018 sub ] exp/tri3a/decode_test/cer_14_0.5 +%WER 13.82 [ 14482 / 104765, 764 ins, 670 del, 13048 sub ] exp/tri4a/decode_test/cer_13_0.5 +%WER 12.12 [ 12694 / 104765, 751 ins, 523 del, 11420 sub ] exp/tri5a/decode_test/cer_13_0.5 +%WER 8.65 [ 9064 / 
104765, 367 ins, 455 del, 8242 sub ] exp/nnet3/tdnn_sp/decode_test/cer_14_0.5 +%WER 7.48 [ 7839 / 104765, 285 ins, 454 del, 7100 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0 + +# nnet3 tdnn with online pitch, local/nnet3/tuning/tun_tdnn_2a.sh +%WER 8.64 [ 9050 / 104765, 349 ins, 521 del, 8180 sub ] exp/nnet3/tdnn_sp/decode_test/cer_15_0.5 +%WER 8.72 [ 9135 / 104765, 367 ins, 422 del, 8346 sub ] exp/nnet3/tdnn_sp_online/decode_test/cer_12_1.0 +%WER 9.36 [ 9807 / 104765, 386 ins, 441 del, 8980 sub ] exp/nnet3/tdnn_sp_online/decode_test_per_utt/cer_13_1.0 + +# chain with online pitch, local/chain/tuning/run_tdnn_2a.sh +%WER 7.45 [ 7807 / 104765, 340 ins, 497 del, 6970 sub ] exp/chain/tdnn_2a_sp/decode_test/cer_11_0.5 +%WER 7.43 [ 7780 / 104765, 341 ins, 469 del, 6970 sub ] exp/chain/tdnn_2a_sp_online/decode_test/cer_11_0.5 +%WER 7.92 [ 8296 / 104765, 384 ins, 472 del, 7440 sub ] exp/chain/tdnn_2a_sp_online/decode_test_per_utt/cer_11_0.5 diff --git a/egs/aishell/s5/conf/online_pitch.conf b/egs/aishell/s5/conf/online_pitch.conf new file mode 100644 index 00000000000..c0f1342160d --- /dev/null +++ b/egs/aishell/s5/conf/online_pitch.conf @@ -0,0 +1,4 @@ +--sample-frequency=16000 +--simulate-first-pass-online=true +--normalization-right-context=25 +--frames-per-chunk=10 diff --git a/egs/aishell/s5/local/aishell_prepare_dict.sh b/egs/aishell/s5/local/aishell_prepare_dict.sh index 3763622a3e7..c4cabb24de4 100755 --- a/egs/aishell/s5/local/aishell_prepare_dict.sh +++ b/egs/aishell/s5/local/aishell_prepare_dict.sh @@ -15,21 +15,9 @@ mkdir -p $dict_dir cp $res_dir/lexicon.txt $dict_dir cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ - sort -u |\ - perl -e ' - my %ph_cl; - while () { - $phone = $_; - chomp($phone); - chomp($_); - $phone = $_; - next if ($phone eq "sil"); - if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } - else { $ph_cl{$phone} = [$_]; } - } - foreach $key ( keys %ph_cl ) { - print "@{ $ph_cl{$key} }\n" - } + perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil"); + m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; } + foreach $l (values %q) {print "$l\n";} ' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1; echo sil > $dict_dir/silence_phones.txt diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh index a0b183e3c5a..b38fa4d9c7a 100755 --- a/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh @@ -90,7 +90,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..6b7223785d9 --- /dev/null +++ b/egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,211 @@ +#!/bin/bash + +# This script is based on run_tdnn_1a.sh. +# This setup used online pitch to train the neural network. +# It requires a online_pitch.conf in the conf dir. 
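That configuration file is added for aishell by this same patch (egs/aishell/s5/conf/online_pitch.conf); for reference, a sketch that recreates it with the committed settings:
```
# Write conf/online_pitch.conf with the same settings this patch adds for aishell.
cat > conf/online_pitch.conf <<'EOF'
--sample-frequency=16000
--simulate-first-pass-online=true
--normalization-right-context=25
--frames-per-chunk=10
EOF
```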
+ +set -e + +# configs for 'chain' +affix= +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_2a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires_online \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in dev test; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 14 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + $lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +dir=${dir}_online +if [ $stage -le 15 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1; + done +fi + +if [ $stage -le 16 ]; then + for test_set in dev test; do + steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" --per-utt true \ + --config conf/decode.config \ + $graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1; + done +fi + +exit; diff --git a/egs/aishell/s5/local/download_and_untar.sh b/egs/aishell/s5/local/download_and_untar.sh index 3578a1c0835..58a278241d7 100755 --- a/egs/aishell/s5/local/download_and_untar.sh +++ b/egs/aishell/s5/local/download_and_untar.sh @@ -57,7 +57,7 @@ if [ -f $data/$part.tgz ]; then if ! $size_ok; then echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" echo "does not equal the size of one of the archives." - rm $data/$part.gz + rm $data/$part.tgz else echo "$data/$part.tgz exists and appears to be complete." 
fi diff --git a/egs/aishell/s5/local/nnet3/run_ivector_common.sh b/egs/aishell/s5/local/nnet3/run_ivector_common.sh index 1643e6381b1..af0ae122372 100755 --- a/egs/aishell/s5/local/nnet3/run_ivector_common.sh +++ b/egs/aishell/s5/local/nnet3/run_ivector_common.sh @@ -14,7 +14,7 @@ stage=0 train_set=train test_sets="dev test" gmm=tri5a - +online=false nnet3_affix= . ./cmd.sh @@ -31,6 +31,11 @@ for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do fi done +online_affix= +if [ $online = true ]; then + online_affix=_online +fi + if [ $stage -le 1 ]; then # Although the nnet will be trained by high resolution data, we still have to # perturb the normal data to get the alignment _sp stands for speed-perturbed @@ -54,26 +59,26 @@ if [ $stage -le 3 ]; then # Create high-resolution MFCC features (with 40 cepstra instead of 13). # this shows how you can split across multiple file-systems. echo "$0: creating high-resolution MFCC features" - mfccdir=mfcc_perturbed_hires + mfccdir=mfcc_perturbed_hires$online_affix if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/aishell-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires$online_affix done # do volume-perturbation on the training data prior to extracting hires # features; this helps make trained nnets more invariant to test data volume. - utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires$online_affix || exit 1; for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + steps/make_mfcc_pitch$online_affix.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires$online_affix exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires$online_affix || exit 1; # create MFCC data dir without pitch to extract iVector - utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1; + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires$online_affix data/${datadir}_hires_nopitch || exit 1; steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; done fi diff --git a/egs/aishell/s5/local/nnet3/run_tdnn.sh b/egs/aishell/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 3cb8cd861a3..00000000000 --- a/egs/aishell/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# This script is based on swbd/s5c/local/nnet3/run_tdnn.sh - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. 
-set -e - -stage=0 -train_stage=-10 -affix= -common_egs_dir= - -# training options -initial_effective_lrate=0.0015 -final_effective_lrate=0.00015 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=12 -remove_egs=true - -# feature options -use_ivectors=true - -# End configuration section. - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=850 - relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) - relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) - relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn6 dim=850 - output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 8 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/train_dnn.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --egs.dir "$common_egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --cleanup.preserve-model-interval 500 \ - --use-gpu true \ - --feat-dir=data/${train_set}_hires \ - --ali-dir $ali_dir \ - --lang data/lang \ - --reporting.email="$reporting_email" \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 9 ]; then - # this version of the decoding treats each utterance separately - # without carrying forward speaker information. 
- for decode_set in dev test; do - num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - decode_dir=${dir}/decode_$decode_set - steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ - $graph_dir data/${decode_set}_hires $decode_dir || exit 1; - done -fi - -wait; -exit 0; diff --git a/egs/aishell/s5/local/nnet3/run_tdnn.sh b/egs/aishell/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aishell/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..3cb8cd861a3 --- /dev/null +++ b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# This script is based on swbd/s5c/local/nnet3/run_tdnn.sh + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aishell/s5/local/nnet3/tuning/run_tdnn_2a.sh b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_2a.sh new file mode 100755 index 00000000000..603149585f2 --- /dev/null +++ b/egs/aishell/s5/local/nnet3/tuning/run_tdnn_2a.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +# This script is based on aishell/s5/local/nnet3/tuning/run_tdnn_1a.sh + +# In this script, the neural network in trained based on hires mfcc and online pitch. +# The online pitch setup requires a online_pitch.conf in the conf dir for both training +# and testing. + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +remove_egs=true + +# feature options +use_ivectors=true + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires_online \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 10 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 11 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_$decode_set + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +if [ $stage -le 12 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+ for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires_online/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}_online/decode_${decode_set}_per_utt + steps/online/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --config conf/decode.config --per-utt true \ + $graph_dir data/${decode_set}_hires_online $decode_dir || exit 1; + done +fi + +wait; +exit 0; diff --git a/egs/aishell/v1/local/aishell_data_prep.sh b/egs/aishell/v1/local/aishell_data_prep.sh index 70d6ba1f3e5..11d131dcdb1 100755 --- a/egs/aishell/v1/local/aishell_data_prep.sh +++ b/egs/aishell/v1/local/aishell_data_prep.sh @@ -40,13 +40,11 @@ n=`cat $train_dir/wav.flist $dev_dir/wav.flist $test_dir/wav.flist | wc -l` # Transcriptions preparation for dir in $train_dir $test_dir; do echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' |\ - sort > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' |\ - sort > $dir/utt2spk_all + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + awk '{print $1}' $dir/transcripts.txt | sort -u > $dir/utt.list utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp sort -u $dir/transcripts.txt > $dir/text diff --git a/egs/aishell/v1/local/download_and_untar.sh b/egs/aishell/v1/local/download_and_untar.sh index 0189bad1d4a..3578a1c0835 100755 --- a/egs/aishell/v1/local/download_and_untar.sh +++ b/egs/aishell/v1/local/download_and_untar.sh @@ -15,7 +15,7 @@ if [ $# -ne 3 ]; then echo "Usage: $0 [--remove-archive] " echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: data_aishell, resource." + echo " can be one of: data_aishell, resource_aishell." fi data=$1 @@ -28,7 +28,7 @@ if [ ! -d "$data" ]; then fi part_ok=false -list="data_aishell resource" +list="data_aishell resource_aishell" for x in $list; do if [ "$part" == $x ]; then part_ok=true; fi done diff --git a/egs/aishell2/README.md b/egs/aishell2/README.md new file mode 100644 index 00000000000..f87f3819036 --- /dev/null +++ b/egs/aishell2/README.md @@ -0,0 +1,64 @@ +# AISHELL-2 + +AISHELL-2 is by far the largest free speech corpus available for Mandarin ASR research. +## 1. DATA +### Training data +* 1000 hours of speech data (around 1 million utterances) +* 1991 speakers (845 male and 1146 female) +* clean recording environment (studio or quiet living room) +* read speech +* reading prompts from various domain: entertainment, finance, technology, sports, control command, place of interest etc. +* near field recording via 3 parallel channels (iOS, Android, Microphone). +* iOS data is free for non-commercial research and education use (e.g. universities and non-commercial institutes) + +### Evaluation data: +Currently we release AISHELL2-2018A-EVAL, containing: +* dev: 2500 utterances from 5 speakers +* test: 5000 utterances from 10 speakers + +Both sets are available across the three channel conditions. 
+
+Anyone interested can download the sets from [here](http://www.aishelltech.com/aishell_eval). Note that we may update and release other evaluation sets on the website later, targeting different applications and scenarios.
+
+## 2. RECIPE
+Based on the standard Kaldi system, AISHELL-2 provides a self-contained Mandarin ASR recipe, with:
+* a word segmentation module, which is a must-have component for Chinese ASR systems
+* an open-source Mandarin lexicon (DaCiDian, available [here](https://github.com/aishell-foundation/DaCiDian))
+* a simplified GMM training & alignment generation recipe (we stopped at the speaker-independent stage)
+* an LF-MMI TDNN training and decoding recipe
+
+# REFERENCE
+We released a [paper on arXiv](https://arxiv.org/abs/1808.10583) with a more detailed description of the corpus and some preliminary results. If you use AISHELL-2 in your experiments, please cite the paper as below:
+```
+@ARTICLE{aishell2,
+  author = {{Du}, J. and {Na}, X. and {Liu}, X. and {Bu}, H.},
+  title = "{AISHELL-2: Transforming Mandarin ASR Research Into Industrial Scale}",
+  journal = {ArXiv},
+  eprint = {1808.10583},
+  primaryClass = "cs.CL",
+  year = 2018,
+  month = Aug,
+}
+```
+
+# APPLY FOR DATA/CONTACT
+The AISHELL foundation is a non-profit online organization whose members come from the speech industry and research institutes.
+
+We hope the AISHELL-2 corpus and recipe will be beneficial to the entire speech community.
+
+Depending on your location and internet speed, we distribute the corpus in one of two ways:
+* hard-disk delivery
+* cloud-disk downloading
+
+To apply for the AISHELL-2 corpus for free, you need to fill in a very simple application form, confirming that:
+* university department / educational institute information has been fully provided
+* the data will be used only for non-commercial research / education purposes
+
+The AISHELL foundation covers all data distribution fees (including the corpus, hard-disk cost, etc.).
+
+Data re-distribution inside your university department is OK for convenience. However, users are not supposed to re-distribute the data to other universities or educational institutes.
+
+To get the application form, or if you come across any problem with the recipe, contact us via:
+
+aishell.foundation@gmail.com
+
diff --git a/egs/aishell2/README.txt b/egs/aishell2/README.txt
deleted file mode 100644
index e8b4260f2bb..00000000000
--- a/egs/aishell2/README.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-# AISHELL-2
-
-AISHELL-2 is by far the largest free speech corpus available for Mandarin ASR research.
-## 1. DATA
-### training data
-* 1000 hours of speech data (around 1 million utterances)
-* 1991 speakers (845 male and 1146 female)
-* clean recording environment(studio or quiet living room)
-* read speech
-* reading prompts from various domain: entertainment, finance, technology, sports, control command, place of interest etc.
-* near field recording via 3 parallel channels(iOS, Android, Microphone).
-* iOS data is free for non-commercial research and education use (e.g. universities and colleges)
-
-### evaluation data:
-Currently we release AISHELL2-2018A-EVAL, containing:
-* dev: 2500 utterances from 5 speaker
-* test: 5000 utterances from 10 speakers
-
-you can download above evaluation set from:
-http://www.aishelltech.com/aishell_eval
-
-we may update and release other evaluation sets on the website later, targeting on different applications and senarios.
-
-## 2. 
RECIPE -Based on Kaldi standard system, AISHELL-2 provides a self-contained Mandarin ASR recipe, with: -* a word segmentation module, which is a must-have component for Chinese ASR systems -* an open-sourced Mandarin lexicon(DaCiDian) -* a simplified GMM training recipe -* acoustic channel adaptation recipe(AM fine-tuning) - -# CONTACT -AISHELL foundation is a non-profit online organization, with members from speech industry and research institutes. - -We hope AISHELL-2 corpus and recipe could be beneficial to the entire speech community. - -Depends on your location and internet speed, we distribute the corpus in two ways: -* hard-disk delivery -* cloud-disk downloading - -To apply for AISHELL-2 corpus for free, you need to fill in a very simple application form, confirming that: -* university department / education institute info -* only for non-commercial research / education use - -AISHELL-foundation covers all data distribution fees (including the corpus, hard-disk cost etc) - -Data re-distribution inside your university department is OK for convenience. However, users are not supposed to re-distribute AISHELL-2 to other universities or education institutes. - -To get the application form, or you come across any problem with the recipe, contact us via: - -aishell.foundation@gmail.com diff --git a/egs/aishell2/s5/conf/online_cmvn.conf b/egs/aishell2/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..048bdfa65de --- /dev/null +++ b/egs/aishell2/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh index 459bd64eeb5..86c9becac5b 100755 --- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh @@ -103,7 +103,7 @@ fi if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh index 30a19293181..d8560e63909 100755 --- a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh @@ -3,18 +3,17 @@ # _1b is as _1a, but with pitch feats, i-vector and dropout schedule added, referenced from wsj # basic info: -# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_all_sp/ -# exp/chain/tdnn_1b_all_sp/: num-iters=1446 nj=2..2 num-params=19.3M dim=43+100->4456 combine=-0.079->-0.075 (over 9) xent:train/valid[962,1445,final]=(-0.922,-0.795,-0.746/-0.960,-0.840,-0.785) logprob:train/valid[962,1445,final]=(-0.084,-0.072,-0.070/-0.085,-0.075,-0.071) +# steps/info/chain_dir_info.pl exp/chain/tdnn_1f_nopitch_ivec_sp/exp/chain/tdnn_1f_nopitch_ivec_sp/: num-iters=578 nj=2..8 num-params=19.3M dim=43+100->4520 combine=-0.082->-0.081 (over 6) xent:train/valid[384,577,final]=(-0.863,-0.752,-0.740/-0.901,-0.791,-0.784) logprob:train/valid[384,577,final]=(-0.083,-0.076,-0.075/-0.084,-0.077,-0.076) # results: -# local/chain/compare_wer.sh exp/chain/tdnn_1d_all_sp/ -# Model tdnn_1d_all_sp +# local/chain/compare_wer.sh exp/chain/tdnn_1f_nopitch_ivec_sp/ +# Model tdnn_1f_nopitch_ivec_sp # Num. 
of params 19.3M -# WER(%) 8.84 -# Final train prob -0.0696 -# Final valid prob -0.0714 -# Final train prob (xent) -0.7458 -# Final valid prob (xent) -0.7854 +# WER(%) 8.81 +# Final train prob -0.0749 +# Final valid prob -0.0756 +# Final train prob (xent) -0.7401 +# Final valid prob (xent) -0.7837 set -e @@ -68,9 +67,12 @@ if [ $stage -le 5 ]; then mfccdir=mfcc_hires for datadir in ${train_set} ${test_sets}; do utils/copy_data_dir.sh data/${datadir} data/${datadir}_hires - utils/data/perturb_data_dir_volume.sh data/${datadir}_hires || exit 1; - steps/make_mfcc_pitch.sh --mfcc-config conf/mfcc_hires.conf --pitch-config conf/pitch.conf \ + utils/data/perturb_data_dir_volume.sh data/${datadir}_hires || exit 1; + steps/make_mfcc_pitch.sh --mfcc-config conf/mfcc_hires.conf --pitch-config conf/pitch.conf \ --nj $nj data/${datadir}_hires exp/make_mfcc/ ${mfccdir} + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_mfcc ${mfccdir} + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_mfcc ${mfccdir} done fi @@ -81,15 +83,11 @@ if [ $stage -le 6 ]; then mkdir -p exp/chain/diag_ubm_${affix} temp_data_root=exp/chain/diag_ubm_${affix} - num_utts_total=$(wc -l < data/${train_set}_hires/utt2spk) + num_utts_total=$(wc -l < data/${train_set}_hires_nopitch/utt2spk) num_utts=$[$num_utts_total/4] - utils/data/subset_data_dir.sh data/${train_set}_hires \ + utils/data/subset_data_dir.sh data/${train_set}_hires_nopitch \ $num_utts ${temp_data_root}/${train_set}_subset - #echo "$0: get cmvn stats if not there for subset" - #[ -f ${temp_data_root}/${train_set}_subset/cmvn.scp ] || \ - steps/compute_cmvn_stats.sh ${temp_data_root}/${train_set}_subset || exit 1; - echo "$0: computing a PCA transform from the hires data." 
steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ @@ -108,13 +106,13 @@ if [ $stage -le 6 ]; then echo "$0: training the iVector extractor" steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj $nj \ - data/${train_set}_hires exp/chain/diag_ubm_${affix} \ + data/${train_set}_hires_nopitch exp/chain/diag_ubm_${affix} \ exp/chain/extractor_${affix} || exit 1; for datadir in ${train_set} ${test_sets}; do - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${datadir}_hires data/${datadir}_hires_max2 + steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${datadir}_hires_nopitch data/${datadir}_hires_nopitch_max2 steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ - data/${datadir}_hires_max2 exp/chain/extractor_${affix} exp/chain/ivectors_${datadir}_${affix} || exit 1; + data/${datadir}_hires_nopitch_max2 exp/chain/extractor_${affix} exp/chain/ivectors_${datadir}_${affix} || exit 1; done fi @@ -152,7 +150,7 @@ if [ $stage -le 10 ]; then echo "$0: creating neural net configs using the xconfig parser"; feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/aishell2/s5/local/prepare_data.sh b/egs/aishell2/s5/local/prepare_data.sh index 419d8eddfd1..4be9664ac31 100755 --- a/egs/aishell2/s5/local/prepare_data.sh +++ b/egs/aishell2/s5/local/prepare_data.sh @@ -45,8 +45,9 @@ utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tm python -c "import jieba" 2>/dev/null || \ (echo "jieba is not found. Use tools/extra/install_jieba.sh to install it." && exit 1;) utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt -awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk 'BEGIN{idx=0}{print $1,idx++}'> $tmp/vocab.txt -python local/word_segmentation.py $tmp/vocab.txt $tmp/trans.txt > $tmp/text +# jieba's vocab format requires word count(frequency), set to 99 +awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk '{print $1,99}'> $tmp/word_seg_vocab.txt +python local/word_segmentation.py $tmp/word_seg_vocab.txt $tmp/trans.txt > $tmp/text # utt2spk & spk2utt awk -F'\t' '{print $2}' $tmp/wav.scp > $tmp/wav.list diff --git a/egs/aishell2/s5/local/prepare_dict.sh b/egs/aishell2/s5/local/prepare_dict.sh index d59585273a7..56ab885ae94 100755 --- a/egs/aishell2/s5/local/prepare_dict.sh +++ b/egs/aishell2/s5/local/prepare_dict.sh @@ -10,7 +10,7 @@ download_dir=data/local/DaCiDian dir=data/local/dict -if [ $# -ne 1 ]; then +if [ $# -ne 1 ]; then echo "Usage: $0 "; exit 1; fi @@ -18,7 +18,9 @@ fi dir=$1 # download the DaCiDian from github -git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir +if [ ! 
-d $download_dir ]; then + git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir +fi # here we map to the phone spn(spoken noise) mkdir -p $dir @@ -27,21 +29,9 @@ echo -e "\tspn" >> $dir/lexicon.txt # prepare silence_phones.txt, nonsilence_phones.txt, optional_silence.txt, extra_questions.txt cat $dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ - sort -u |\ - perl -e ' - my %ph_cl; - while () { - $phone = $_; - chomp($phone); - chomp($_); - $phone = $_; - next if ($phone eq "sil"); - if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } - else { $ph_cl{$phone} = [$_]; } - } - foreach $key ( keys %ph_cl ) { - print "@{ $ph_cl{$key} }\n" - } + perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil"); + m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; } + foreach $l (values %q) {print "$l\n";} ' | sort -k1 > $dir/nonsilence_phones.txt || exit 1; echo sil > $dir/silence_phones.txt @@ -49,9 +39,8 @@ echo sil > $dir/optional_silence.txt cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { - $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; if($p eq "\$0"){$q{""} .= "$p ";}else{$q{$2} .= "$p ";} } } foreach $l (values %q) {print "$l\n";}' \ >> $dir/extra_questions.txt || exit 1; echo "local/prepare_dict.sh succeeded" exit 0; - diff --git a/egs/aishell2/s5/local/word_segmentation.py b/egs/aishell2/s5/local/word_segmentation.py index 1cb2c1e7350..4ce55a2003e 100644 --- a/egs/aishell2/s5/local/word_segmentation.py +++ b/egs/aishell2/s5/local/word_segmentation.py @@ -4,6 +4,7 @@ # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) # Apache 2.0 +from __future__ import print_function import sys import jieba reload(sys) @@ -19,6 +20,6 @@ jieba.set_dictionary(vocab_file) for line in open(trans_file): key,trans = line.strip().split('\t',1) - words = jieba.cut(trans) + words = jieba.cut(trans, HMM=False) # turn off new word discovery (HMM-based) new_line = key + '\t' + " ".join(words) print(new_line) diff --git a/egs/ami/s5/local/ami_download.sh b/egs/ami/s5/local/ami_download.sh index b14f8550c75..cba130c8467 100755 --- a/egs/ami/s5/local/ami_download.sh +++ b/egs/ami/s5/local/ami_download.sh @@ -53,12 +53,12 @@ cat local/split_train.orig local/split_eval.orig local/split_dev.orig > $wdir/am wgetfile=$wdir/wget_$mic.sh # TODO fix this with Pawel, files don't exist anymore, -manifest="wget --continue -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-04237-Sun-Jun-15-2014.manifest.txt" -license="wget --continue -O $adir/LICENCE.TXT http://groups.inf.ed.ac.uk/ami/download/temp/Creative-Commons-Attribution-NonCommercial-ShareAlike-2.5.txt" +manifest="wget --continue -O $adir/MANIFEST.TXT http://groups.inf.ed.ac.uk/ami/download/temp/amiBuild-0153-Tue-Oct-2-2018.manifest.txt" + echo "#!/bin/bash" > $wgetfile echo $manifest >> $wgetfile -echo $license >> $wgetfile + while read line; do if [ "$mic" == "ihm" ]; then extra_headset= #some meetings have 5 sepakers (headsets) @@ -100,8 +100,7 @@ else fi fi -echo "Downloads of AMI corpus completed succesfully. License can be found under $adir/LICENCE.TXT" +echo "Downloads of AMI corpus completed succesfully." 
exit 0; - diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index 3157d7ffec7..7112e0259a0 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -87,18 +87,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } - $pu=$_[1]; $pt=$_[4]; + $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" $dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index 4cfa9110edf..9c4b55308f2 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -94,19 +94,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 91baa37d6e1..815e1b2d270 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -101,19 +101,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. 
Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5/local/sort_bad_utts.py b/egs/ami/s5/local/sort_bad_utts.py index f84fcb12608..baabdc73508 100644 --- a/egs/ami/s5/local/sort_bad_utts.py +++ b/egs/ami/s5/local/sort_bad_utts.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function import sys import argparse import logging @@ -38,10 +39,10 @@ def GetSortedWers(utt_info_file): utt_wer_sorted = sorted(utt_wer, key = lambda k : k[1]) try: import numpy as np - bins = range(0,105,5) + bins = list(range(0,105,5)) bins.append(sys.float_info.max) - hist, bin_edges = np.histogram(map(lambda x: x[1], utt_wer_sorted), + hist, bin_edges = np.histogram([x[1] for x in utt_wer_sorted], bins = bins) num_utts = len(utt_wer) string = '' diff --git a/egs/ami/s5/local/tfrnnlm/run_lstm.sh b/egs/ami/s5/local/tfrnnlm/run_lstm.sh index 31ae4a8bad7..d68fadb10f3 100755 --- a/egs/ami/s5/local/tfrnnlm/run_lstm.sh +++ b/egs/ami/s5/local/tfrnnlm/run_lstm.sh @@ -27,7 +27,7 @@ mkdir -p $dir if [ $stage -le 2 ]; then # the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it $cuda_cmd $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ - python steps/tfrnnlm/lstm.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final + python steps/tfrnnlm/lstm.py --data_path=$dir --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn @@ -39,7 +39,7 @@ if [ $stage -le 3 ]; then decode_dir=${basedir}/decode_${decode_set} # Lattice rescoring - steps/lmrescore_rnnlm_lat.sh \ + steps/tfrnnlm/lmrescore_rnnlm_lat.sh \ --cmd "$tfrnnlm_cmd --mem 16G" \ --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ diff --git a/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh b/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh index 8dd876c2b2c..4cc71b55b5c 100755 --- a/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh +++ b/egs/ami/s5/local/tfrnnlm/run_lstm_fast.sh @@ -27,7 +27,7 @@ mkdir -p $dir if [ $stage -le 2 ]; then # the following script uses TensorFlow. You could use tools/extras/install_tensorflow_py.sh to install it $cuda_cmd $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ - python steps/tfrnnlm/lstm_fast.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final + python steps/tfrnnlm/lstm_fast.py --data_path=$dir --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn diff --git a/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh b/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh index 7a4635f07a4..7a95f38ba1e 100755 --- a/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh +++ b/egs/ami/s5/local/tfrnnlm/run_vanilla_rnnlm.sh @@ -27,7 +27,7 @@ mkdir -p $dir if [ $stage -le 2 ]; then # the following script uses TensorFlow. 
You could use tools/extras/install_tensorflow_py.sh to install it $cuda_cmd $dir/train_rnnlm.log utils/parallel/limit_num_gpus.sh \ - python steps/tfrnnlm/vanilla_rnnlm.py --data-path=$dir --save-path=$dir/rnnlm --vocab-path=$dir/wordlist.rnn.final + python steps/tfrnnlm/vanilla_rnnlm.py --data_path=$dir --save_path=$dir/rnnlm --vocab_path=$dir/wordlist.rnn.final fi final_lm=ami_fsh.o3g.kn @@ -39,7 +39,7 @@ if [ $stage -le 3 ]; then decode_dir=${basedir}/decode_${decode_set} # Lattice rescoring - steps/lmrescore_rnnlm_lat.sh \ + steps/tfrnnlm/lmrescore_rnnlm_lat.sh \ --cmd "$tfrnnlm_cmd --mem 16G" \ --rnnlm-ver tensorflow --weight $weight --max-ngram-order $ngram_order \ data/lang_$LM $dir \ diff --git a/egs/ami/s5b/RESULTS_ihm b/egs/ami/s5b/RESULTS_ihm index 42af5763829..7eb908f685e 100644 --- a/egs/ami/s5b/RESULTS_ihm +++ b/egs/ami/s5b/RESULTS_ihm @@ -86,8 +86,7 @@ %WER 19.8 | 13098 94475 | 83.1 9.6 7.4 2.8 19.8 51.8 | -0.041 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_dev/ascore_10/dev_hires.ctm.filt.sys %WER 19.2 | 12643 89964 | 83.2 10.7 6.1 2.5 19.2 49.7 | 0.079 | exp/ihm/chain_cleaned/tdnn_lstm1l_sp_bi_ld5/decode_eval/ascore_10/eval_hires.ctm.filt.sys -# local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic ihm +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic ihm # cleanup + chain TDNN+LSTM model + IHM reverberated data -%WER 19.4 | 13098 94479 | 83.8 10.0 6.1 3.2 19.4 51.8 | -0.168 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi/decode_dev/ascore_10/dev_hires.ctm.filt.sys -%WER 19.3 | 12643 89977 | 83.3 11.0 5.7 2.6 19.3 49.6 | -0.046 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi/decode_eval/ascore_10/eval_hires.ctm.filt.sys - +%WER 18.9 | 13098 94488 | 84.1 9.7 6.2 3.0 18.9 51.2 | 0.012 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi/decode_dev/ascore_11/dev_hires.ctm.filt.sys +%WER 19.3 | 12643 89989 | 83.1 10.7 6.2 2.5 19.3 50.0 | 0.136 | exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi/decode_eval/ascore_11/eval_hires.ctm.filt.sys diff --git a/egs/ami/s5b/RESULTS_sdm b/egs/ami/s5b/RESULTS_sdm index 0993b2eb52a..584c50f298a 100644 --- a/egs/ami/s5b/RESULTS_sdm +++ b/egs/ami/s5b/RESULTS_sdm @@ -93,9 +93,13 @@ %WER 35.9 | 14900 94497 | 67.8 18.2 14.1 3.7 35.9 62.5 | 0.647 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_dev/ascore_9/dev_hires_o4.ctm.filt.sys %WER 39.4 | 13223 89946 | 64.1 19.7 16.2 3.5 39.4 67.0 | 0.611 | exp/sdm1/chain_cleaned/tdnn_lstm1l_sp_bi_ihmali_ld5/decode_eval/ascore_9/eval_hires_o4.ctm.filt.sys -# local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned # cleanup + chain TDNN+LSTM model, SDM original + IHM reverberated data, alignments from ihm data. 
-# *** best system *** -%WER 34.0 | 14455 94497 | 69.8 17.7 12.5 3.8 34.0 63.9 | 0.675 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi_ihmali/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys -%WER 37.5 | 13261 89982 | 65.9 19.3 14.7 3.5 37.5 66.2 | 0.642 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1i_sp_rvb_bi_ihmali/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +%WER 33.9 | 14185 94492 | 70.3 18.1 11.7 4.2 33.9 66.0 | 0.605 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 37.4 | 13610 89969 | 66.3 19.9 13.7 3.7 37.4 65.5 | 0.568 | exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# cleanup + chain TDNN-F model, SDM original + IHM reverberated data, alignments from ihm data. +# *** best system *** +%WER 33.3 | 14696 94538 | 70.4 17.2 12.4 3.7 33.3 63.1 | 0.612 | exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali/decode_dev/ascore_10/dev_hires_o4.ctm.filt.sys +%WER 36.7 | 14855 89974 | 66.7 18.9 14.4 3.4 36.7 59.8 | 0.580 | exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali/decode_eval/ascore_10/eval_hires_o4.ctm.filt.sys diff --git a/egs/ami/s5b/conf/mfcc_hires80.conf b/egs/ami/s5b/conf/mfcc_hires80.conf new file mode 100644 index 00000000000..5fb03de59c4 --- /dev/null +++ b/egs/ami/s5b/conf/mfcc_hires80.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=80 # similar to Google's setup. +--num-ceps=80 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh index 746c42c4c1a..c54876331f1 100755 --- a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh @@ -93,18 +93,15 @@ sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; join $dir/utt2spk $dir/segments | \ perl -ne '{BEGIN{$pu=""; $pt=0.0;} split; if ($pu eq $_[1] && $pt > $_[3]) { - print "$_[0] $_[2] $_[3] $_[4]>$_[0] $_[2] $pt $_[4]\n" + print "s/^$_[0] $_[2] $_[3] $_[4]\$/$_[0] $_[2] $pt $_[4]/;\n" } $pu=$_[1]; $pt=$_[4]; }' > $dir/segments_to_fix -if [ `cat $dir/segments_to_fix | wc -l` -gt 0 ]; then + +if [ -s $dir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $dir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s!$p1!$p2!" 
$dir/segments - done < $dir/segments_to_fix + perl -i -pf $dir/segments_to_fix $dir/segments fi # Copy stuff into its final locations diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 65f514f223c..475ef5405ba 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -99,19 +99,15 @@ awk '{print $1}' $tmpdir/segments | \ join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s/^"utt, wav, t_beg, t_end"$/"utt, wav, t_end_prev, t_end"/;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index 1378f8b8965..580880818fc 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -107,29 +107,25 @@ awk '{print $1}' $tmpdir/segments | \ #check and correct the case when segment timings for given speaker overlap themself #(important for simulatenous asclite scoring to proceed). -#There is actually only one such case for devset and automatic segmentetions +#There is actually only one such case for devset and automatic segmentations join $tmpdir/utt2spk_stm $tmpdir/segments | \ awk '{ utt=$1; spk=$2; wav=$3; t_beg=$4; t_end=$5; if(spk_prev == spk && t_end_prev > t_beg) { - print utt, wav, t_beg, t_end">"utt, wav, t_end_prev, t_end; + print "s:[^\\S\\n]+$::;s:^"utt, wav, t_beg, t_end"$:"utt, wav, t_end_prev, t_end":;"; } spk_prev=spk; t_end_prev=t_end; }' > $tmpdir/segments_to_fix -if [ `cat $tmpdir/segments_to_fix | wc -l` -gt 0 ]; then +if [ -s $tmpdir/segments_to_fix ]; then echo "$0. Applying following fixes to segments" cat $tmpdir/segments_to_fix - while read line; do - p1=`echo $line | awk -F'>' '{print $1}'` - p2=`echo $line | awk -F'>' '{print $2}'` - sed -ir "s:$p1:$p2:" $tmpdir/segments - done < $tmpdir/segments_to_fix + perl -i -pf $tmpdir/segments_to_fix $tmpdir/segments fi # Copy stuff into its final locations [this has been moved from the format_data # script] mkdir -p $dir -for f in spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do +for f in segments_to_fix spk2utt utt2spk utt2spk_stm wav.scp text segments reco2file_and_channel; do cp $tmpdir/$f $dir/$f || exit 1; done diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh deleted file mode 100755 index 754a9508e66..00000000000 --- a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh +++ /dev/null @@ -1,283 +0,0 @@ -#!/bin/bash - -# This is a chain-training script with TDNN neural networks. -# This script is based on local/chain/tuning/run_tdnn_1a.sh, but adding -# the reverberated IHM data into the train set. -# This script obtains better results on IHM, SDM and MDM tasks. 
- -# Please see RESULTS_* for examples of command lines invoking this script. - -# local/chain/multi_condition/run_tdnn.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & -# local/chain/multi_condition/run_tdnn.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & -# local/chain/multi_condition/run_tdnn.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & - - -set -e -o pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=1 -mic=ihm -nj=30 -min_seg_len=1.55 -use_ihm_ali=false -train_set=train_cleaned -gmm=tri3_cleaned # the gmm for the target data -ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). -num_threads_ubm=32 -num_data_reps=1 - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! $use_ihm_ali; then - [ "$mic" != "ihm" ] && \ - echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ - exit 1; -else - [ "$mic" == "ihm" ] && \ - echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ - exit 1; -fi - -if ! cuda-compiled; then - cat <data/lang_chain/topo - fi -fi - -if [ $stage -le 13 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $original_lat_dir - rm $original_lat_dir/fsts.*.gz # save space - - lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats - - mkdir -p $lat_dir/temp/ - mkdir -p $lat_dir/temp2/ - lattice-copy "ark:gunzip -c $original_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.*.gz |" ark,scp:$lat_dir/temp2/lats.ark,$lat_dir/temp2/lats.scp - - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - cat $lat_dir/temp/lats.scp >> $lat_dir/temp/combined_lats.scp - for i in `seq 1 $num_data_reps`; do - cat $lat_dir/temp2/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done - sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs - - # copy other files from original lattice dir - for f in cmvn_opts final.mdl splice_opts tree; do - cp $original_lat_dir/$f $lat_dir/$f - done -fi - - -if [ $stage -le 14 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 15 ]; then - mkdir -p $dir - - echo "$0: creating neural net configs"; - - steps/nnet3/tdnn/make_configs.py \ - --self-repair-scale-nonlinearity 0.00001 \ - --feat-dir data/$mic/${train_set}_sp_hires_comb \ - --ivector-dir $train_ivector_dir \ - --tree-dir $tree_dir \ - --relu-dim 450 \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0 0" \ - --use-presoftmax-prior-scale false \ - --xent-regularize 0.1 \ - --xent-separate-forward-affine true \ - --include-log-softmax false \ - --final-layer-normalize-target 1.0 \ - $dir/configs || exit 1; -fi - -if [ $stage -le 16 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-rvb$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage - fi - - touch $dir/egs/.nodelete # keep egs around when that run dies. - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir $train_ivector_dir \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize 0.1 \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ - --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - - -graph_dir=$dir/graph_${LM} -if [ $stage -le 17 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. 
- utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir -fi - -if [ $stage -le 18 ]; then - rm $dir/.error 2>/dev/null || true - for decode_set in dev eval; do - ( - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ - --scoring-opts "--min-lmwt 5 " \ - $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; - ) || touch $dir/.error & - done - wait - if [ -f $dir/.error ]; then - echo "$0: something went wrong in decoding" - exit 1 - fi -fi -exit 0 diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh b/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh index 8e647598556..a4fa11e0908 120000 --- a/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh +++ b/egs/ami/s5b/local/chain/multi_condition/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1a.sh \ No newline at end of file +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..4d260e3c517 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_1a.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +# This script is based on swbd 7q TDNN-F recipe +# with resnet-style skip connections, more layers, +# skinnier bottlenecks, removing the 3-way splicing and skip-layer splicing, +# and re-tuning the learning rate and l2 regularize. The configs are +# standardized and substantially simplified. +# The advantage of this style of config is that it also works +# well on smaller datasets, and we adopt this style here also for consistency. +# This gives better results than TDNN+LSTM on AMI SDM. + +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_1a.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali +# exp/sdm1/chain_cleaned_rvb/tdnn1a_sp_rvb_bi_ihmali: num-iters=193 nj=3..16 num-params=17.5M dim=40+100->3728 combine=-0.122->-0.121 (over 2) xent:train/valid[127,192,final]=(-2.03,-1.57,-1.58/-2.12,-1.71,-1.71) logprob:train/valid[127,192,final]=(-0.179,-0.121,-0.122/-0.198,-0.158,-0.157) + +# local/chain/compare_wer_general.sh sdm1 chain_cleaned_rvb tdnn_lstm1b_sp_rvb_bi_ihmali tdnn1a_sp_rvb_bi_ihmali +# System tdnn_lstm1b_sp_rvb_bi_ihmali tdnn1a_sp_rvb_bi_ihmali +# WER on dev 33.9 33.3 +# WER on eval 37.4 36.7 +# Final train prob -0.133611 -0.122155 +# Final valid prob -0.161014 -0.156612 +# Final train prob (xent) -1.9774 -1.57504 +# Final valid prob (xent) -2.09991 -1.705 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +mic=ihm +nj=30 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 +num_epochs=6 +get_egs_stage=-5 +remove_egs=false + +chunk_width=160,140,110,80 +dropout_schedule='0,0@0.20,0.5@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +xent_regularize=0.1 + +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# decode options +frames_per_chunk=160 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) + + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp + + for i in `seq 1 $num_data_reps`; do + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh index 2869049843f..3546b6a7ced 100755 --- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh @@ -19,7 +19,6 @@ set -e -o pipefail stage=0 mic=ihm nj=30 -min_seg_len=1.55 use_ihm_ali=false train_set=train_cleaned gmm=tri3_cleaned # the gmm for the target data @@ -27,7 +26,7 @@ ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). num_threads_ubm=32 num_data_reps=1 -chunk_width=150 +chunk_width=160,140,110,80 chunk_left_context=40 chunk_right_context=0 label_delay=5 @@ -35,13 +34,13 @@ label_delay=5 # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tlstm_affix=1i #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. 
# decode options extra_left_context=50 -frames_per_chunk= +frames_per_chunk=160 # End configuration section. @@ -75,21 +74,19 @@ rvb_affix=_rvb if $use_ihm_ali; then gmm_dir=exp/ihm/${ihm_gmm} - ali_dir=exp/${mic}/${ihm_gmm}_ali_${train_set}_sp_comb_ihmdata - lores_train_data_dir=data/$mic/${train_set}_ihmdata_sp_comb + lores_train_data_dir=data/$mic/${train_set}_ihmdata_sp tree_dir=exp/$mic/chain${nnet3_affix}/tree_bi${tree_affix}_ihmdata - original_lat_dir=exp/$mic/chain${nnet3_affix}/${ihm_gmm}_${train_set}_sp_comb_lats_ihmdata - lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${ihm_gmm}_${train_set}_sp${rvb_affix}_comb_lats_ihmdata + original_lat_dir=exp/$mic/chain${nnet3_affix}/${ihm_gmm}_${train_set}_sp_lats_ihmdata + lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${ihm_gmm}_${train_set}_sp${rvb_affix}_lats_ihmdata dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/tdnn_lstm${tlstm_affix}_sp${rvb_affix}_bi_ihmali # note: the distinction between when we use the 'ihmdata' suffix versus # 'ihmali' is pretty arbitrary. else gmm_dir=exp/${mic}/$gmm - ali_dir=exp/${mic}/${gmm}_ali_${train_set}_sp_comb - lores_train_data_dir=data/$mic/${train_set}_sp_comb + lores_train_data_dir=data/$mic/${train_set}_sp tree_dir=exp/$mic/chain${nnet3_affix}/tree_bi${tree_affix} - original_lat_dir=exp/$mic/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats - lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${gmm}_${train_set}_sp${rvb_affix}_comb_lats + original_lat_dir=exp/$mic/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + lat_dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/${gmm}_${train_set}_sp${rvb_affix}_lats dir=exp/$mic/chain${nnet3_affix}${rvb_affix}/tdnn_lstm${tlstm_affix}_sp${rvb_affix}_bi fi @@ -97,9 +94,7 @@ fi local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ --mic $mic \ --nj $nj \ - --min-seg-len $min_seg_len \ --train-set $train_set \ - --gmm $gmm \ --num-threads-ubm $num_threads_ubm \ --num-data-reps $num_data_reps \ --nnet3-affix "$nnet3_affix" @@ -109,13 +104,13 @@ local/nnet3/multi_condition/run_ivector_common.sh --stage $stage \ local/nnet3/prepare_lores_feats.sh --stage $stage \ --mic $mic \ --nj $nj \ - --min-seg-len $min_seg_len \ + --min-seg-len "" \ --use-ihm-ali $use_ihm_ali \ --train-set $train_set -train_data_dir=data/$mic/${train_set}_sp${rvb_affix}_hires_comb -train_ivector_dir=exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${train_set}_sp${rvb_affix}_hires_comb +train_data_dir=data/$mic/${train_set}_sp${rvb_affix}_hires +train_ivector_dir=exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${train_set}_sp${rvb_affix}_hires final_lm=`cat data/local/lm/final_lm` LM=$final_lm.pr1-7 @@ -126,19 +121,6 @@ for f in $gmm_dir/final.mdl $lores_train_data_dir/feats.scp \ done -if [ $stage -le 11 ]; then - if [ -f $ali_dir/ali.1.gz ]; then - echo "$0: alignments in $ali_dir appear to already exist. Please either remove them " - echo " ... or use a later --stage option." - exit 1 - fi - echo "$0: aligning perturbed, short-segment-combined ${maybe_ihm}data" - steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - ${lores_train_data_dir} data/lang $gmm_dir $ali_dir -fi - -[ ! -f $ali_dir/ali.1.gz ] && echo "$0: expected $ali_dir/ali.1.gz to exist" && exit 1 - if [ $stage -le 12 ]; then echo "$0: creating lang directory with one state per phone." # Create a version of the lang/ directory that has one state per phone in the @@ -165,28 +147,42 @@ fi if [ $stage -le 13 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
# use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ data/lang $gmm_dir $original_lat_dir rm $original_lat_dir/fsts.*.gz # save space - lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) - mkdir -p $lat_dir/temp/ - mkdir -p $lat_dir/temp2/ - lattice-copy "ark:gunzip -c $original_lat_dir/lat.*.gz |" ark,scp:$lat_dir/temp/lats.ark,$lat_dir/temp/lats.scp - lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.*.gz |" ark,scp:$lat_dir/temp2/lats.ark,$lat_dir/temp2/lats.scp + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp - # copy the lattices for the reverberated data - rm -f $lat_dir/temp/combined_lats.scp - touch $lat_dir/temp/combined_lats.scp - cat $lat_dir/temp/lats.scp >> $lat_dir/temp/combined_lats.scp for i in `seq 1 $num_data_reps`; do - cat $lat_dir/temp2/lats.scp | sed -e "s/^/rev${i}_/" >> $lat_dir/temp/combined_lats.scp - done + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp - lattice-copy scp:$lat_dir/temp/combined_lats_sorted.scp "ark:|gzip -c >$lat_dir/lat.1.gz" || exit 1; - echo "1" > $lat_dir/num_jobs + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs # copy other files from original lattice dir for f in cmvn_opts final.mdl splice_opts tree; do @@ -206,7 +202,7 @@ if [ $stage -le 14 ]; then steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --context-opts "--context-width=2 --central-position=1" \ --leftmost-questions-truncate -1 \ - --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir fi xent_regularize=0.1 @@ -215,7 +211,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -312,7 +308,6 @@ if [ $stage -le 18 ]; then rm $dir/.error 2>/dev/null || true [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; - [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; for decode_set in dev eval; do ( diff --git 
a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..1a839b045bd --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,360 @@ +#!/bin/bash + +# This is a chain-training script with TDNN+LSTM neural networks. +# This script is similar to local/chain/multi_condition/tuning/run_tdnn_lstm_1a.sh, +# but updated to use new l2-regularize options and fast-lstmp with decay-time. +# It uses the reverberated IHM data in the train set. +# This script obtains better results on IHM, SDM and MDM tasks. + +# Please see RESULTS_* for examples of command lines invoking this script. + +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + +# steps/info/chain_dir_info.pl exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi +# exp/ihm/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi: num-iters=176 nj=2..12 num-params=43.4M dim=40+100->3736 combine=-0.101->-0.100 (over 2) xent:train/valid[116,175,final]=(-2.47,-1.60,-1.55/-2.58,-1.73,-1.69) logprob:train/valid[116,175,final]=(-0.144,-0.101,-0.099/-0.163,-0.138,-0.136) +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali +# exp/sdm1/chain_cleaned_rvb/tdnn_lstm1b_sp_rvb_bi_ihmali: num-iters=174 nj=2..12 num-params=43.4M dim=40+100->3728 combine=-0.129->-0.126 (over 4) xent:train/valid[115,173,final]=(-2.86,-1.97,-1.98/-2.96,-2.10,-2.10) logprob:train/valid[115,173,final]=(-0.184,-0.134,-0.134/-0.200,-0.164,-0.161) + +# local/chain/compare_wer_general.sh ihm chain_cleaned_rvb tdnn_lstm1{a,b}_sp_rvb_bi +# System tdnn_lstm1a_sp_rvb_bi tdnn_lstm1b_sp_rvb_bi +# WER on dev 19.4 18.9 +# WER on eval 19.4 19.3 +# Final train prob -0.0627414-0.0985175 +# Final valid prob -0.141082 -0.136302 +# Final train prob (xent) -0.847054 -1.55263 +# Final valid prob (xent) -1.25849 -1.69064 + +# local/chain/compare_wer_general.sh sdm1 chain_cleaned_rvb tdnn_lstm1{a,b}_sp_rvb_bi_ihmali +# System tdnn_lstm1a_sp_rvb_bi_ihmali tdnn_lstm1b_sp_rvb_bi_ihmali +# WER on dev 34.6 33.9 +# WER on eval 37.6 37.4 +# Final train prob -0.0861836 -0.133611 +# Final valid prob -0.149669 -0.161014 +# Final train prob (xent) -1.21927 -1.9774 +# Final valid prob (xent) -1.53542 -2.09991 + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 +num_epochs=4 + +chunk_width=160,140,110,80 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +dropout_schedule='0,0@0.20,0.3@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +xent_regularize=0.025 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. 
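+# The dropout_schedule above is, as far as I understand the format, a list of
+# dropout proportions, each optionally tagged with the fraction of training at
+# which it applies ("proportion@fraction"), with linear interpolation in
+# between. So '0,0@0.20,0.3@0.50,0' keeps dropout at 0 for the first 20% of
+# training, ramps it to 0.3 at the halfway point, and anneals it back to 0 by
+# the end; e.g. at 35% of training the proportion would be about 0.15.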
+tlstm_affix=1b #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# decode options +extra_left_context=50 +frames_per_chunk=160 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) + + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp + + for i in `seq 1 $num_data_reps`; do + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.006" + lstm_opts="l2-regularize=0.0025 decay-time=20 dropout-proportion=0.0" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 $tdnn_opts + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}${rvb_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh index deb68d515d2..05a7c2d345b 120000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1i.sh \ No newline at end of file +tuning/run_tdnn_1j.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh index 16d1f4044f5..d926c1dc6d7 100644 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -184,7 +184,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git 
a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh index 83e6a95582f..d9cd1c356e8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1b.sh @@ -176,7 +176,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0" diff --git a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh index 387b4bfcc88..a0805b4f9f1 100755 --- a/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1c.sh @@ -185,7 +185,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh index 57108dbddae..997357b80a9 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1b.sh @@ -164,7 +164,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh index f87e1a12d36..4d062e65429 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1c.sh @@ -151,7 +151,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh index eb84a1cd876..387570388d0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh @@ -163,7 +163,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh index e6592b667dc..0436b08cdc0 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1e.sh @@ -161,7 +161,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info 
$tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh index 8bf2b73dada..4ca526d63b8 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1f.sh @@ -165,7 +165,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh index dfb6dfedee7..baed760bb68 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1g.sh @@ -166,7 +166,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh index 3e26a8b38bd..e721a858c0a 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1h.sh @@ -167,7 +167,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh index 1931127c86d..de40cb2d1a4 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1i.sh @@ -168,7 +168,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.02" output_opts="l2-regularize=0.004" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh new file mode 100755 index 00000000000..80b2aee60e9 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1j.sh @@ -0,0 +1,281 @@ +#!/bin/bash + +# 1j is same as swbd 7q. It uses modified topology with resnet-style skip connections, more layers, +# skinnier bottlenecks. 
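+# Back-of-the-envelope intuition for the "skinnier bottlenecks" mentioned
+# above (a rough sketch only; the actual tdnnf-layer splicing and orthonormal
+# constraints differ in detail): a full-rank 2136->2136 layer over a 3-frame
+# context would need about 2136*3*2136 ~= 13.7M weights, whereas factorizing
+# it through the 210-dim bottleneck used in the xconfig further down needs
+# roughly 2136*3*210 + 210*2136 ~= 1.8M, e.g.
+#   python -c "print(2136*3*2136, 2136*3*210 + 210*2136)"   # 13687488 1794240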
+ +# local/chain/tuning/run_tdnn_1j.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned + +# local/chain/compare_wer_general.sh sdm1 tdnn1i_sp_bi_ihmali tdnn1j_sp_bi_ihmali +# System tdnn1i_sp_bi_ihmali tdnn1i_sp_bi_ihmali +# WER on dev 36.6 31.7 +# WER on eval 40.6 35.1 +# Final train prob -0.196231 -0.114088 +# Final valid prob -0.265572 -0.214282 +# Final train prob (xent) -2.48061 -1.37987 +# Final valid prob (xent) -2.71794 -1.8639 + +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali +# exp/sdm1/chain_cleaned/tdnn1j_sp_bi_ihmali: num-iters=327 nj=2..12 num-params=34.3M dim=80+100->3728 combine=-0.126->-0.124 (over 4) xent:train/valid[217,326,final]=(-1.69,-1.43,-1.38/-2.06,-1.93,-1.86) logprob:train/valid[217,326,final]=(-0.143,-0.120,-0.114/-0.226,-0.218,-0.214) + +set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=15 +remove_egs=true + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1j #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=80 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=2136 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=2136 bottleneck-dim=210 time-stride=3 + linear-component name=prefinal-l dim=512 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=2136 small-dim=512 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 50 \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index d63712f1f0f..4f580b88f6b 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -171,7 +171,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh index a53785f45c2..904a079d7de 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -173,7 +173,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh index 
76a9f735c5f..511e520465a 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -172,7 +172,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh index 8cc1a4e15fa..bd81b7df4eb 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -172,7 +172,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh index accfd158a9d..50903e78b6d 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh index 2b275e4e27d..f6c53001498 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -173,7 +173,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh index 1c90af38c4c..79fd9ef3fb5 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh index fb4b6a475e2..e58a7f89e03 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -171,7 +171,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep 
num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh index 92636b4c17e..13f894f5a48 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -174,7 +174,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh index 89fd8ce2915..48b31832e8c 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -181,7 +181,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh index b8d947d8e92..e675bc494bb 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -177,7 +177,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh index 74c0f5a6ead..2d019398274 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -224,7 +224,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh index b0e7af0618d..9e5b971bbe2 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -226,7 +226,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh index bee4d997b01..9575c3cf686 
100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -178,7 +178,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh index 1e4111adc6a..a7f2625c181 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -182,7 +182,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.025" lstm_opts="l2-regularize=0.01" output_opts="l2-regularize=0.004" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh index b672a44e572..ca920869b30 100755 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh @@ -180,7 +180,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) tdnn_opts="l2-regularize=0.003" lstm_opts="l2-regularize=0.005" output_opts="l2-regularize=0.001" diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh index f68c4203767..53dbd5238db 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -178,7 +178,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh index ac4266ca162..dafef668e60 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1b.sh @@ -177,7 +177,7 @@ if [ $stage -le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh index 74b21f10c33..677946d0b9a 100644 --- a/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_opgru_1c.sh @@ -176,7 +176,7 @@ if [ $stage 
-le 15 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh index eb20415e515..5ba35fa421c 100755 --- a/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/multi_condition/run_ivector_common.sh @@ -10,19 +10,17 @@ set -e -o pipefail stage=1 mic=ihm nj=30 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync with - # the same option given to prepare_lores_feats.sh. train_set=train_cleaned # you might set this to e.g. train_cleaned. -gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; - # it should contain alignments for 'train_set'. - +norvb_datadir=data/ihm/train_cleaned_sp num_threads_ubm=32 rvb_affix=_rvb nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it # becomes exp/$mic/nnet3_cleaned or whatever. num_data_reps=1 +sample_rate=16000 + +max_jobs_run=10 . ./cmd.sh . ./path.sh @@ -30,10 +28,7 @@ num_data_reps=1 nnet3_affix=${nnet3_affix}$rvb_affix -gmmdir=exp/${mic}/${gmm} - - -for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do +for f in data/${mic}/${train_set}/feats.scp; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" exit 1 @@ -73,36 +68,23 @@ if [ $stage -le 1 ]; then for datadir in ${train_set}_sp dev eval; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/$mic/${datadir}_hires + --cmd "$train_cmd --max-jobs-run $max_jobs_run" data/$mic/${datadir}_hires steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires utils/fix_data_dir.sh data/$mic/${datadir}_hires done fi -if [ $stage -le 2 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${mic}/${train_set}_sp_hires $min_seg_len data/${mic}/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${mic}/${train_set}_sp_hires/cmvn.scp data/${mic}/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${mic}/${train_set}_sp_hires_comb/ -fi if [ $stage -le 3 ]; then echo "$0: creating reverberated MFCC features" - datadir=data/ihm/train_cleaned_sp - - mfccdir=${datadir}_rvb${num_data_reps}_hires/data + mfccdir=${norvb_datadir}${rvb_affix}${num_data_reps}_hires/data if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - if [ ! -f ${datadir}_rvb${num_data_reps}_hires/feats.scp ]; then - if [ ! -d "RIRS_NOISES" ]; then + if [ ! -f ${norvb_datadir}${rvb_affix}${num_data_reps}_hires/feats.scp ]; then + if [ ! 
-d "RIRS_NOISES/" ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip unzip rirs_noises.zip @@ -123,60 +105,29 @@ if [ $stage -le 3 ]; then --isotropic-noise-addition-probability 1 \ --num-replications ${num_data_reps} \ --max-noises-per-minute 1 \ - --source-sampling-rate 16000 \ - ${datadir} ${datadir}_rvb${num_data_reps} + --source-sampling-rate $sample_rate \ + ${norvb_datadir} ${norvb_datadir}${rvb_affix}${num_data_reps} - utils/copy_data_dir.sh ${datadir}_rvb${num_data_reps} ${datadir}_rvb${num_data_reps}_hires - utils/data/perturb_data_dir_volume.sh ${datadir}_rvb${num_data_reps}_hires + utils/copy_data_dir.sh ${norvb_datadir}${rvb_affix}${num_data_reps} ${norvb_datadir}${rvb_affix}${num_data_reps}_hires + utils/data/perturb_data_dir_volume.sh ${norvb_datadir}${rvb_affix}${num_data_reps}_hires steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" ${datadir}_rvb${num_data_reps}_hires - steps/compute_cmvn_stats.sh ${datadir}_rvb${num_data_reps}_hires - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires - - utils/data/combine_short_segments.sh \ - ${datadir}_rvb${num_data_reps}_hires $min_seg_len ${datadir}_rvb${num_data_reps}_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp ${datadir}_rvb${num_data_reps}_hires/cmvn.scp ${datadir}_rvb${num_data_reps}_hires_comb/ - utils/fix_data_dir.sh ${datadir}_rvb${num_data_reps}_hires_comb/ + --cmd "$train_cmd --max-jobs-run $max_jobs_run" ${norvb_datadir}${rvb_affix}${num_data_reps}_hires + steps/compute_cmvn_stats.sh ${norvb_datadir}${rvb_affix}${num_data_reps}_hires + utils/fix_data_dir.sh ${norvb_datadir}${rvb_affix}${num_data_reps}_hires fi - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires data/${mic}/${train_set}_sp_hires ${datadir}_rvb${num_data_reps}_hires - utils/combine_data.sh data/${mic}/${train_set}_sp_rvb_hires_comb data/${mic}/${train_set}_sp_hires_comb ${datadir}_rvb${num_data_reps}_hires_comb + utils/combine_data.sh data/${mic}/${train_set}_sp${rvb_affix}_hires data/${mic}/${train_set}_sp_hires ${norvb_datadir}${rvb_affix}${num_data_reps}_hires fi - if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/$mic/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${mic}/${train_set}/feats.scp \ - data/${mic}/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l $tmpdir/ihmutt2utt # Map the 1st field of the segments file from the ihm data (the 1st field being # the utterance-id) to the corresponding SDM or MDM utterance-id. The other # fields remain the same (e.g. we want the recording-ids from the IHM data). 
-utils/apply_map.pl -f 1 $tmpdir/ihmutt2utt data/$mic/train_ihmdata/segments +utils/apply_map.pl -f 1 $tmpdir/ihmutt2utt data/$mic/${train_set}_ihmdata/segments -utils/fix_data_dir.sh data/$mic/train_ihmdata +utils/fix_data_dir.sh data/$mic/${train_set}_ihmdata rm $tmpdir/ihmutt2utt diff --git a/egs/an4/s5/local/data_prep.py b/egs/an4/s5/local/data_prep.py index 24cb9bffb07..9d8083f3b60 100644 --- a/egs/an4/s5/local/data_prep.py +++ b/egs/an4/s5/local/data_prep.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import os import re import sys diff --git a/egs/an4/s5/local/lexicon_prep.py b/egs/an4/s5/local/lexicon_prep.py index 8d451daf869..3584fa86dfb 100644 --- a/egs/an4/s5/local/lexicon_prep.py +++ b/egs/an4/s5/local/lexicon_prep.py @@ -15,6 +15,7 @@ # See the Apache 2 License for the specific language governing permissions and # limitations under the License. +from __future__ import print_function import os import re import sys diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh index 8ff59d83ed0..bd13010c791 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_7b.sh @@ -138,7 +138,7 @@ if [ $stage -le 11 ]; then num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" diff --git a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh index 0ca6062e9c8..b5979a3ce6b 100755 --- a/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh +++ b/egs/aspire/s5/local/chain/tuning/run_blstm_asp_1.sh @@ -208,7 +208,7 @@ if [ $stage -le 14 ]; then extra_right_context=$[$chunk_right_context+10] # %WER 26.8 | 2120 27220 | 80.2 11.7 8.1 7.0 26.8 76.5 | -0.804 | exp/chain/blstm_asp_1/decode_dev_aspire_whole_uniformsegmented_win10_over5_v7_iterfinal_pp_fg/score_9/penalty_0.0/ - local/nnet3/prep_test_aspire.sh --stage 4 --decode-num-jobs 30 --affix "v7" \ + local/multi_condition/prep_test_aspire.sh --stage 4 --decode-num-jobs 30 --affix "v7" \ --extra-left-context $extra_left_context \ --extra-right-context $extra_right_context \ --frames-per-chunk $chunk_width \ diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh index 201f61dc64b..af12e323e76 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_7b.sh @@ -44,7 +44,7 @@ lang=data/lang_chain # The iVector-extraction and feature-dumping parts are the same as the standard # nnet3 setup, and you can skip them by setting "--stage 8" if you have already # run those things. 
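# For instance (an illustrative invocation; this assumes --stage and
# --num-data-reps are exposed via utils/parse_options.sh as in other recipes),
# a rerun that reuses previously dumped features and iVectors could be:
#   local/chain/tuning/run_tdnn_7b.sh --stage 8 --num-data-reps 3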
-local/nnet3/run_ivector_common.sh --stage $stage --num-data-reps 3|| exit 1; +local/nnet3/run_ivector_common.sh --stage $stage --num-data-reps ${num_data_reps} || exit 1; if [ $stage -le 7 ]; then # Create a version of the lang/ directory that has one state per phone in the @@ -92,8 +92,8 @@ if [ $stage -le 9 ]; then # combine the non-hires features for alignments/lattices rm -rf data/${latgen_train_set}_min${min_seg_len} - utt_prefix="THISISUNIQUESTRING_" - spk_prefix="THISISUNIQUESTRING_" + utt_prefix="THISISUNIQUESTRING-" + spk_prefix="THISISUNIQUESTRING-" utils/copy_data_dir.sh --spk-prefix "$spk_prefix" --utt-prefix "$utt_prefix" \ data/train data/train_temp_for_lats utils/data/combine_short_segments.sh \ @@ -136,7 +136,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -182,6 +182,7 @@ if [ $stage -le 12 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aspire-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi + mkdir -p $dir/egs touch $dir/egs/.nodelete # keep egs around when that run dies. steps/nnet3/chain/train.py --stage $train_stage \ diff --git a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh index 63d3a7ca988..f98dff5e6fa 100755 --- a/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/aspire/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -26,7 +26,6 @@ cell_dim=1024 projection_dim=256 # training options -num_epochs=2 minibatch_size=64,32 chunk_left_context=40 chunk_right_context=0 @@ -95,7 +94,7 @@ if [ $stage -le 8 ]; then for n in `seq $nj`; do awk '{print $1}' data/${train_set}/split$nj/$n/utt2spk | \ - perl -ane 's/rev[1-3]_//g' > $lat_dir/uttlist.$n.$nj + perl -ane 's/rev[1-3]-//g' > $lat_dir/uttlist.$n.$nj done rm -f $lat_dir/lat_tmp.*.{ark,scp} 2>/dev/null @@ -106,7 +105,7 @@ if [ $stage -le 8 ]; then ark,scp:$lat_dir/lat_tmp.JOB.ark,$lat_dir/lat_tmp.JOB.scp || exit 1 for n in `seq 3`; do - cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"_"$1" "$2}' + cat $lat_dir/lat_tmp.*.scp | awk -v n=$n '{print "rev"n"-"$1" "$2}' done > $lat_dir/lat_rvb.scp $train_cmd JOB=1:$nj $lat_dir/log/copy_rvb_lattices.JOB.log \ @@ -151,7 +150,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=40" @@ -309,4 +308,3 @@ if [ $stage -le 17 ]; then fi exit 0; - diff --git a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh index f4366fef679..2ceb4a4cf05 100755 --- a/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh +++ b/egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh @@ -63,7 +63,8 @@ if [ $stage -le 3 ]; then [ -d data/${segmented_data_set}_hires ] && rm -r data/${segmented_data_set}_hires if [ ! 
-f data/${data_set}_hires/segments ]; then utils/data/get_segments_for_data.sh data/${data_set}_hires > \ - data/${data_set}_hires/segments + data/${data_set}_hires/segments.tmp + mv data/${data_set}_hires/segments.tmp data/${data_set}_hires/segments fi mkdir -p data/${segmented_data_set}_hires diff --git a/egs/aspire/s5/local/multi_condition/create_uniform_segments.py b/egs/aspire/s5/local/multi_condition/create_uniform_segments.py index e7baafc028c..010811490ef 100755 --- a/egs/aspire/s5/local/multi_condition/create_uniform_segments.py +++ b/egs/aspire/s5/local/multi_condition/create_uniform_segments.py @@ -4,13 +4,14 @@ # creates a segments file in the provided data directory # into uniform segments with specified window and overlap +from __future__ import division import imp, sys, argparse, os, math, subprocess min_segment_length = 10 # in seconds def segment(total_length, window_length, overlap = 0): increment = window_length - overlap num_windows = int(math.ceil(float(total_length)/increment)) - segments = map(lambda x: (x * increment, min( total_length, (x * increment) + window_length)), range(0, num_windows)) + segments = [(x * increment, min( total_length, (x * increment) + window_length)) for x in range(0, num_windows)] if segments[-1][1] - segments[-1][0] < min_segment_length: segments[-2] = (segments[-2][0], segments[-1][1]) segments.pop() @@ -53,7 +54,7 @@ def prepare_segments_file(kaldi_data_dir, window_length, overlap): parser = argparse.ArgumentParser() parser.add_argument('--window-length', type = float, default = 30.0, help = 'length of the window used to cut the segment') parser.add_argument('--overlap', type = float, default = 5.0, help = 'overlap of neighboring windows') - parser.add_argument('data_dir', type=str, help='directory such as data/train') + parser.add_argument('data_dir', help='directory such as data/train') params = parser.parse_args() diff --git a/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py b/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py index e249e54e5f6..2b4bcddda69 100755 --- a/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py +++ b/egs/aspire/s5/local/multi_condition/fill_missing_recordings.py @@ -38,14 +38,14 @@ def fill_ctm(input_ctm_file, output_ctm_file, recording_names): sys.stderr.write(str(" ".join(sys.argv))) parser = argparse.ArgumentParser(usage) - parser.add_argument('input_ctm_file', type=str, help='ctm file for the recordings') - parser.add_argument('output_ctm_file', type=str, help='ctm file for the recordings') - parser.add_argument('recording_name_file', type=str, help='file with names of the recordings') + parser.add_argument('input_ctm_file', help='ctm file for the recordings') + parser.add_argument('output_ctm_file', help='ctm file for the recordings') + parser.add_argument('recording_name_file', help='file with names of the recordings') params = parser.parse_args() try: - file_names = map(lambda x: x.strip(), open("{0}".format(params.recording_name_file)).readlines()) + file_names = [x.strip() for x in open("{0}".format(params.recording_name_file)).readlines()] except IOError: raise Exception("Expected to find {0}".format(params.recording_name_file)) diff --git a/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py b/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py index cc06f58616a..1f06d3e7c3b 100755 --- a/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py +++ b/egs/aspire/s5/local/multi_condition/get_air_file_patterns.py @@ -3,6 +3,7 @@ # script to 
generate the file_patterns of the AIR database # see load_air.m file in AIR db to understand the naming convention +from __future__ import print_function import sys, glob, re, os.path air_dir = sys.argv[1] @@ -45,4 +46,4 @@ file_patterns.append(file_pattern+" "+output_file_name) file_patterns = list(set(file_patterns)) file_patterns.sort() -print "\n".join(file_patterns) +print("\n".join(file_patterns)) diff --git a/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh b/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh deleted file mode 100755 index 23f3bcb8378..00000000000 --- a/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash -# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. - -# This script produces CTM files from a decoding directory that has lattices -# present. This version gives you confidence scores. - - -# begin configuration section. -cmd=run.pl -stage=0 -min_lmwt=5 -max_lmwt=20 -use_segments=true # if we have a segments file, use it to convert - # the segments to be relative to the original files. -iter=final -#end configuration section. - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [options] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --use-segments (true|false) # use segments and reco2file_and_channel files " - echo " # to produce a ctm relative to the original audio" - echo " # files, with channel information (typically needed" - echo " # for NIST scoring)." - echo "e.g.:" - echo "$0 data/train data/lang exp/tri4a/decode/" - echo "See also: steps/get_train_ctm.sh" - exit 1; -fi - -data=$1 -lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. -dir=$3 - -model=$dir/../$iter.mdl # assume model one level up from decoding dir. - - -for f in $lang/words.txt $model $dir/lat.1.gz; do - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; -done - -name=`basename $data`; # e.g. eval2000 - -mkdir -p $dir/scoring/log - -if [ -f $dir/../frame_shift ]; then - frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" - echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" -elif [ -f $dir/../frame_subsampling_factor ]; then - factor=$(cat $dir/../frame_subsampling_factor) || exit 1 - frame_shift_opt="--frame-shift=0.0$factor" - echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" -fi - - - -if [ $stage -le 0 ]; then - if [ -f $data/segments ] && $use_segments; then - f=$data/reco2file_and_channel - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; - filter_cmd="utils/convert_ctm.pl $data/segments $data/reco2file_and_channel" - else - filter_cmd=cat - fi - - if [ -f $lang/phones/word_boundary.int ]; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/score_LMWT/ '&&' \ - lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; - else - if [ ! 
-f $lang/phones/align_lexicon.int ]; then - echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align." - exit 1; - fi - - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \ - mkdir -p $dir/score_LMWT/ '&&' \ - lattice-prune --inv-acoustic-scale=LMWT --beam=5 "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ - lattice-to-ctm-conf $frame_shift_opt --decode-mbr=true --inv-acoustic-scale=LMWT ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - $filter_cmd '>' $dir/score_LMWT/$name.ctm || exit 1; - fi -fi - - diff --git a/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh b/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh new file mode 120000 index 00000000000..4c0ff429c31 --- /dev/null +++ b/egs/aspire/s5/local/multi_condition/get_ctm_conf.sh @@ -0,0 +1 @@ +../../../../wsj/s5/steps/conf/get_ctm_conf.sh \ No newline at end of file diff --git a/egs/aspire/s5/local/multi_condition/normalize_wavs.py b/egs/aspire/s5/local/multi_condition/normalize_wavs.py index dabf420d9f8..6e67d2113c1 100755 --- a/egs/aspire/s5/local/multi_condition/normalize_wavs.py +++ b/egs/aspire/s5/local/multi_condition/normalize_wavs.py @@ -3,6 +3,8 @@ # normalizes the wave files provided in input file list with a common scaling factor # the common scaling factor is computed to 1/\sqrt(1/(total_samples) * \sum_i{\sum_j x_i(j)^2}) where total_samples is sum of all samples of all wavefiles. If the data is multi-channel then each channel is treated as a seperate wave files +from __future__ import division +from __future__ import print_function import argparse, scipy.io.wavfile, warnings, numpy as np, math def get_normalization_coefficient(file_list, is_rir, additional_scaling): @@ -29,7 +31,7 @@ def get_normalization_coefficient(file_list, is_rir, additional_scaling): assert(rate == sampling_rate) else: sampling_rate = rate - data = data / dtype_max_value + data = data/dtype_max_value if is_rir: # just count the energy of the direct impulse response # this is treated as energy of signal from 0.001 seconds before impulse @@ -55,8 +57,8 @@ def get_normalization_coefficient(file_list, is_rir, additional_scaling): except IOError: warnings.warn("Did not find the file {0}.".format(file)) assert(total_samples > 0) - scaling_coefficient = np.sqrt(total_samples / total_energy) - print "Scaling coefficient is {0}.".format(scaling_coefficient) + scaling_coefficient = np.sqrt(total_samples/total_energy) + print("Scaling coefficient is {0}.".format(scaling_coefficient)) if math.isnan(scaling_coefficient): raise Exception(" Nan encountered while computing scaling coefficient. 
This is mostly due to numerical overflow") return scaling_coefficient diff --git a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh index 804de611cae..8297cdee9ca 100755 --- a/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh +++ b/egs/aspire/s5/local/multi_condition/prepare_impulses_noises.sh @@ -114,7 +114,7 @@ cp ${output_dir}_non_normalized/info/* $output_dir/info # rename file location in the noise-rir pairing files for file in `ls $output_dir/info/noise_impulse*`; do - sed -i "s/_non_normalized//g" $file + perl -i -pe "s/_non_normalized//g" $file done # generating the rir-list with probabilities alloted for each rir diff --git a/egs/aspire/s5/local/multi_condition/read_rir.py b/egs/aspire/s5/local/multi_condition/read_rir.py index a2e1c2052e2..04898bda760 100755 --- a/egs/aspire/s5/local/multi_condition/read_rir.py +++ b/egs/aspire/s5/local/multi_condition/read_rir.py @@ -29,9 +29,9 @@ def usage(): #sys.stderr.write(" ".join(sys.argv)+"\n") parser = argparse.ArgumentParser(usage()) parser.add_argument('--output-sampling-rate', type = int, default = 8000, help = 'sampling rate of the output') - parser.add_argument('type', type = str, default = None, help = 'database type', choices = ['air']) - parser.add_argument('input', type = str, default = None, help = 'directory containing the multi-channel data for a particular recording, or file name or file-regex-pattern') - parser.add_argument('output_filename', type = str, default = None, help = 'output filename (if "-" then output is written to output pipe)') + parser.add_argument('type', default = None, help = 'database type', choices = ['air']) + parser.add_argument('input', default = None, help = 'directory containing the multi-channel data for a particular recording, or file name or file-regex-pattern') + parser.add_argument('output_filename', default = None, help = 'output filename (if "-" then output is written to output pipe)') params = parser.parse_args() if params.output_filename == "-": diff --git a/egs/aspire/s5/local/multi_condition/reverberate_wavs.py b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py index 998a3ed5e74..f43e4a2f894 100755 --- a/egs/aspire/s5/local/multi_condition/reverberate_wavs.py +++ b/egs/aspire/s5/local/multi_condition/reverberate_wavs.py @@ -4,18 +4,20 @@ # script to generate multicondition training data / dev data / test data import argparse, glob, math, os, random, scipy.io.wavfile, sys -class list_cyclic_iterator: +class list_cyclic_iterator(object): def __init__(self, list, random_seed = 0): self.list_index = 0 self.list = list random.seed(random_seed) random.shuffle(self.list) - def next(self): + def __next__(self): item = self.list[self.list_index] self.list_index = (self.list_index + 1) % len(self.list) return item + next = __next__ # for Python 2 + def return_nonempty_lines(lines): new_lines = [] for line in lines: @@ -71,15 +73,15 @@ def return_nonempty_lines(lines): for i in range(len(wav_files)): wav_file = " ".join(wav_files[i].split()[1:]) output_wav_file = wav_out_files[i] - impulse_file = impulses.next() + impulse_file = next(impulses) noise_file = '' snr = '' found_impulse = False if add_noise: - for i in xrange(len(impulse_noise_index)): + for i in range(len(impulse_noise_index)): if impulse_file in impulse_noise_index[i][0]: - noise_file = impulse_noise_index[i][1].next() - snr = snrs.next() + noise_file = next(impulse_noise_index[i][1]) + snr = next(snrs) assert(len(wav_file.strip()) > 0) 
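# --- Editorial note (illustration, not part of the patch above) -------------
# The list_cyclic_iterator hunk replaces the Python-2-only .next() method with
# __next__() plus a "next = __next__" alias, and the call sites switch to the
# builtin next(...), which works under both Python 2 and 3. A minimal,
# self-contained sketch of the same pattern (hypothetical names, runnable on
# either interpreter):
class cyclic(object):
    def __init__(self, items):
        self.items = list(items)      # items to cycle through forever
        self.index = 0
    def __next__(self):               # Python 3 iteration protocol
        item = self.items[self.index]
        self.index = (self.index + 1) % len(self.items)
        return item
    next = __next__                   # keeps obj.next() working under Python 2

rirs = cyclic(["rir1.wav", "rir2.wav"])
assert [next(rirs) for _ in range(3)] == ["rir1.wav", "rir2.wav", "rir1.wav"]
# ---------------------------------------------------------------------------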
assert(len(impulse_file.strip()) > 0) assert(len(noise_file.strip()) > 0) diff --git a/egs/aspire/s5/local/nnet3/segment_and_decode.sh b/egs/aspire/s5/local/nnet3/segment_and_decode.sh index d66b72200c1..e8917d091e2 100755 --- a/egs/aspire/s5/local/nnet3/segment_and_decode.sh +++ b/egs/aspire/s5/local/nnet3/segment_and_decode.sh @@ -109,9 +109,9 @@ fi if [ $stage -le 4 ]; then utils/copy_data_dir.sh $sad_work_dir/${segmented_data_set}_seg \ - data/${segmented_data_set}_hires - steps/compute_cmvn_stats.sh data/${segmented_data_set}_hires - utils/fix_data_dir.sh data/${segmented_data_set}_hires + data/${segmented_data_set}_seg_hires + steps/compute_cmvn_stats.sh data/${segmented_data_set}_seg_hires + utils/fix_data_dir.sh data/${segmented_data_set}_seg_hires fi if [ $stage -le 5 ]; then @@ -122,11 +122,11 @@ if [ $stage -le 5 ]; then # acoustic conditions drift over time within the speaker's data. steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $decode_num_jobs \ --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ - data/${segmented_data_set}_hires $lang $ivector_root_dir/extractor \ - $ivector_root_dir/ivectors_${segmented_data_set} + data/${segmented_data_set}_seg_hires $lang $ivector_root_dir/extractor \ + $ivector_root_dir/ivectors_${segmented_data_set}_seg fi -decode_dir=$dir/decode_${segmented_data_set}${affix}_pp +decode_dir=$dir/decode_${segmented_data_set}_seg${affix}_pp if [ $stage -le 6 ]; then echo "Generating lattices" rm -f ${decode_dir}_tg/.error @@ -138,8 +138,8 @@ if [ $stage -le 6 ]; then --extra-right-context-final $extra_right_context_final \ --frames-per-chunk "$frames_per_chunk" \ --skip-scoring true ${iter:+--iter $iter} --lattice-beam $lattice_beam \ - --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set} \ - $graph data/${segmented_data_set}_hires ${decode_dir}_tg || \ + --online-ivector-dir $ivector_root_dir/ivectors_${segmented_data_set}_seg \ + $graph data/${segmented_data_set}_seg_hires ${decode_dir}_tg || \ { echo "$0: Error decoding" && exit 1; } fi @@ -147,7 +147,7 @@ if [ $stage -le 7 ]; then echo "Rescoring lattices" steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ --skip-scoring true \ - ${lang}_pp_test{,_fg} data/${segmented_data_set}_hires \ + ${lang}_pp_test{,_fg} data/${segmented_data_set}_seg_hires \ ${decode_dir}_{tg,fg}; fi @@ -161,5 +161,5 @@ if [ $stage -le 8 ]; then ${iter:+--iter $iter} \ --decode-mbr true \ --tune-hyper true \ - $lang $decode_dir $act_data_set $segmented_data_set $out_file + $lang $decode_dir $act_data_set ${segmented_data_set}_seg $out_file fi diff --git a/egs/aspire/s5/local/run_asr_segmentation.sh b/egs/aspire/s5/local/run_asr_segmentation.sh index de0a925a242..095e47e99de 100755 --- a/egs/aspire/s5/local/run_asr_segmentation.sh +++ b/egs/aspire/s5/local/run_asr_segmentation.sh @@ -213,7 +213,7 @@ if [ $stage -le 9 ]; then # Use left and right context options that were used when training # the chain nnet # Increase sil-scale to predict more silence - local/nnet3/prep_test_aspire_segmentation.sh --stage $test_stage \ + local/nnet3/segment_and_decode.sh --stage $test_stage \ --decode-num-jobs $test_nj --affix "${test_affix}" \ --sad-opts "$sad_opts" \ --sad-graph-opts "--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" --sad-priors-opts "--sil-scale=0.1" \ diff --git a/egs/aurora4/s5/RESULTS b/egs/aurora4/s5/RESULTS index a7d4a444a02..dc9af7171f7 100644 --- a/egs/aurora4/s5/RESULTS +++ b/egs/aurora4/s5/RESULTS @@ -1,8 +1,19 @@ -for x in 
exp/{mono,tri,sgmm,nnet,dnn}*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done +for x in exp/{mono,tri,sgmm,nnet,dnn,chain/tdnn*}*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done -%WER 19.61 [ 14698 / 74942, 1233 ins, 3759 del, 9706 sub ] exp/tri2b_multi/decode_tgpr_5k_eval92/wer_13 -%WER 13.93 [ 10437 / 74942, 732 ins, 2695 del, 7010 sub ] exp/tri3a_dnn/decode_tgpr_5k_eval92/wer_10 -%WER 13.61 [ 10202 / 74942, 660 ins, 2987 del, 6555 sub ] exp/tri4a_dnn/decode_tgpr_5k_eval92/wer_11 +# mono +%WER 37.42 [ 14223 / 38010, 1030 ins, 2613 del, 10580 sub ] exp/mono0a_multi/decode_tgpr_0166/wer_10 +%WER 38.18 [ 28612 / 74942, 1919 ins, 5319 del, 21374 sub ] exp/mono0a_multi/decode_tgpr_eval92/wer_10 +# tri2b +%WER 20.42 [ 7763 / 38010, 827 ins, 1905 del, 5031 sub ] exp/tri2b_multi/decode_tgpr_5k_0166/wer_12 +%WER 19.61 [ 14728 / 74942, 1411 ins, 3548 del, 9769 sub ] exp/tri2b_multi/decode_tgpr_5k_eval92/wer_12 + +# tri3b +%WER 15.71 [ 5970 / 38010, 641 ins, 1403 del, 3926 sub ] exp/tri3b_multi/decode_tgpr_0166/wer_13 +%WER 15.28 [ 11454 / 74942, 1082 ins, 2633 del, 7739 sub ] exp/tri3b_multi/decode_tgpr_eval92/wer_13 + +# chain +%WER 7.88 [ 2994 / 38010, 216 ins, 1045 del, 1733 sub ] exp/chain/tdnn1a_sp/decode_tgpr_5k_0166/wer_15 +%WER 7.67 [ 5745 / 74942, 392 ins, 1758 del, 3595 sub ] exp/chain/tdnn1a_sp/decode_tgpr_5k_eval92/wer_13 for x in /mnt/matylda3/qmallidi/Karels_New-Parametric-ReLU/kaldi/egs/aurora4/s5_PReLU/exp/{mono,tri,sgmm,nnet,dnn}*/decode*; do [ -d $x ] && grep WER $x/wer_* | /mnt/matylda5/iveselyk/DEVEL/kaldi-official/egs/aurora4/s5/utils/best_wer.sh; done diff --git a/egs/aurora4/s5/conf/mfcc_hires.conf b/egs/aurora4/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/aurora4/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/aurora4/s5/conf/online_cmvn.conf b/egs/aurora4/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/aurora4/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/aurora4/s5/local/chain/compare_wer.sh b/egs/aurora4/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..91701cad9e9 --- /dev/null +++ b/egs/aurora4/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... 
]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( +"# WER eval92 (tgpr_5k) " +"# WER 0166 (tgpr_5k) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_5k_eval92 tgpr_5k_0166) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
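# --- Editorial note (illustration, not part of the script above) ------------
# set_names splits a "dir:epoch" argument on the first colon; a guarded,
# never-executed example (directory names are made up):
if false; then
  set_names exp/chain/tdnn_d_sp_smbr:3
  echo "$dirname $epoch_infix"   # -> exp/chain/tdnn_d_sp_smbr _epoch3
  set_names exp/chain/tdnn1a_sp
  echo "$dirname $epoch_infix"   # -> exp/chain/tdnn1a_sp (empty epoch_infix)
fi
# ---------------------------------------------------------------------------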
+fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/aurora4/s5/local/chain/run_tdnn.sh b/egs/aurora4/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aurora4/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aurora4/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aurora4/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..8bc69f9c8cf --- /dev/null +++ b/egs/aurora4/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# 1a is same as 1h setup in WSJ + +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# System tdnn1a_sp +# WER eval92 (tgpr_5k) 7.67 +# WER 0166 (tgpr_5k) 7.88 +# Final train prob -0.0338 +# Final valid prob -0.0602 +# Final train prob (xent) -0.7632 +# Final valid prob (xent) -0.9377 +# Num-params 8315264 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp: num-iters=24 nj=2..8 num-params=8.3M dim=40+100->2752 combine=-0.034->-0.034 (over 1) xent:train/valid[15,23,final]=(-1.13,-0.809,-0.763/-1.16,-0.961,-0.938) logprob:train/valid[15,23,final]=(-0.063,-0.038,-0.034/-0.068,-0.062,-0.060) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si84_multi +test_sets="eval92 0166" +gmm=tri3b_multi # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. + +num_threads_ubm=8 + +nj_extractor=10 +# It runs a JOB with '-pe smp N', where N=$[threads*processes] +num_threads_extractor=4 +num_processes_extractor=2 + +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=false + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 10 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 12 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + delta-layer name=delta input=idct + no-op-component name=input2 input=Append(delta, Scale(1.0, ReplaceIndex(ivector, t, 0))) + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=1024 input=input2 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if 
[ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=5000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=wait \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr_5k/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr_5k \ + $tree_dir $tree_dir/graph_tgpr_5k || exit 1; + +fi + +if [ $stage -le 15 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in ${test_sets}; do + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l 5." + exit 1 +fi + +if [ $stage -le 4 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 5 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. 
+ utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 6 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l " data/local/lang_tmp data/lang || exit 1; +if [ $stage -le 1 ]; then + local/wsj_prepare_dict.sh + utils/prepare_lang.sh data/local/dict "" data/local/lang_tmp data/lang +fi -local/aurora4_format_data.sh || exit 1; +if [ $stage -le 2 ]; then + local/aurora4_format_data.sh +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc -for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do - steps/make_mfcc.sh --nj 10 \ - data/$x exp/make_mfcc/$x $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; -done - -# make fbank features -fbankdir=fbank -mkdir -p data-fbank -for x in train_si84_clean train_si84_multi dev_0330 dev_1206 test_eval92 test_0166; do - cp -r data/$x data-fbank/$x - steps/make_fbank.sh --nj 10 \ - data-fbank/$x exp/make_fbank/$x $fbankdir || exit 1; -done - -# Note: the --boost-silence option should probably be omitted by default -# for normal setups. It doesn't always help. [it's to discourage non-silence -# models from modeling silence.] -#steps/train_mono.sh --boost-silence 1.25 --nj 10 \ -# data/train_si84_clean data/lang exp/mono0a || exit 1; - -steps/train_mono.sh --boost-silence 1.25 --nj 10 \ - data/train_si84_multi data/lang exp/mono0a_multi || exit 1; -#( -# utils/mkgraph.sh data/lang_test_tgpr exp/mono0a exp/mono0a/graph_tgpr && \ -# steps/decode.sh --nj 8 \ -# exp/mono0a/graph_tgpr data/test_eval92 exp/mono0a/decode_tgpr_eval92 -#) & - -#steps/align_si.sh --boost-silence 1.25 --nj 10 \ -# data/train_si84_clean data/lang exp/mono0a exp/mono0a_ali || exit 1; -steps/align_si.sh --boost-silence 1.25 --nj 10 \ - data/train_si84_multi data/lang exp/mono0a_multi exp/mono0a_multi_ali || exit 1; - -#steps/train_deltas.sh --boost-silence 1.25 \ -# 2000 10000 data/train_si84_clean data/lang exp/mono0a_ali exp/tri1 || exit 1; - -steps/train_deltas.sh --boost-silence 1.25 \ - 2000 10000 data/train_si84_multi data/lang exp/mono0a_multi_ali exp/tri1_multi || exit 1; - +if [ $stage -le 3 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + for x in train_si84_clean train_si84_multi test_eval92 test_0166 dev_0330 dev_1206; do + steps/make_mfcc.sh --nj 10 \ + data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + done +fi + +model_affix= +if [ $train_set == 'multi' ]; then + model_affix=_multi +fi + +if [ $stage -le 4 ]; then + # Note: the --boost-silence option should probably be omitted by default + # for normal setups. It doesn't always help. [it's to discourage non-silence + # models from modeling silence.] 
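# --- Editorial note (illustration, not part of the patch above) -------------
# Each block below is guarded by "[ $stage -le N ]" plus the $train / $decode
# booleans, and ${model_affix} picks the clean vs. multi-condition model
# directories. Assuming run.sh parses these variables as command-line flags
# via utils/parse_options.sh (the usual Kaldi convention; the option-parsing
# preamble is not visible in this hunk), a partial rerun would look roughly
# like:
#   ./run.sh --stage 4 --train false --decode true   # redo only the decodes
#   ./run.sh --stage 6 --train true --decode false   # retrain from tri2 onward
# ---------------------------------------------------------------------------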
+ if $train; then + steps/train_mono.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/mono0a${model_affix} || exit 1; + fi + + if $decode; then + for testdir in $test_sets; do + utils/mkgraph.sh data/lang_test_tgpr exp/mono0a${model_affix} exp/mono0a${model_affix}/graph_tgpr && \ + steps/decode.sh --nj 8 --cmd "$decode_cmd" \ + exp/mono0a${model_affix}/graph_tgpr data/test_${testdir} exp/mono0a${model_affix}/decode_tgpr_${testdir} + done + fi +fi + +if [ $stage -le 5 ]; then + # tri1 + if $train; then + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/mono0a${model_affix} exp/mono0a${model_affix}_ali || exit 1; + + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/train_si84_${train_set} data/lang exp/mono0a${model_affix}_ali exp/tri1${model_affix} || exit 1; + fi +fi + +if [ $stage -le 6 ]; then + # tri2 + if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/tri1${model_affix} exp/tri1${model_affix}_ali_si84 || exit 1; + + steps/train_deltas.sh --cmd "$train_cmd" 2500 15000 \ + data/train_si84_${train_set} data/lang exp/tri1${model_affix}_ali_si84 exp/tri2a${model_affix} || exit 1; + + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_si84_${train_set} data/lang exp/tri2a${model_affix} exp/tri2a${model_affix}_ali_si84 || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + 2500 15000 data/train_si84_${train_set} data/lang exp/tri2a${model_affix}_ali_si84 exp/tri2b${model_affix} || exit 1; + fi + + if $decode; then + for testdir in $test_sets; do + utils/mkgraph.sh data/lang_test_tgpr_5k exp/tri2b${model_affix} exp/tri2b${model_affix}/graph_tgpr_5k || exit 1; + steps/decode.sh --nj 8 --cmd "$decode_cmd" \ + exp/tri2b${model_affix}/graph_tgpr_5k data/test_${testdir} exp/tri2b${model_affix}/decode_tgpr_5k_${testdir} || exit 1; + done + fi +fi + +if [ $stage -le 7 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. + + # Align tri2b system with all the si84 data. 
+ if $train; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + data/train_si84_${train_set} data/lang exp/tri2b${model_affix} exp/tri2b${model_affix}_ali_si84 || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_si84_${train_set} data/lang exp/tri2b${model_affix}_ali_si84 exp/tri3b${model_affix} || exit 1; + fi + + if $decode; then + for testdir in $test_sets; do + nspk=$(wc -l > sys.stderr, "warning: suspicious JOB argument " + argv[0]; + print("warning: suspicious JOB argument " + argv[0], file=sys.stderr); if jobstart > jobend: sys.stderr.write("lonestar.py: JOBSTART("+ str(jobstart) + ") must be lower than JOBEND(" + str(jobend) + ")\n") @@ -238,8 +239,8 @@ def setup_paths_and_vars(opts): cwd = os.getcwd() if opts.varname and (opts.varname not in opts.logfile ) and (opts.jobstart != opts.jobend): - print >>sys.stderr, "lonestar.py: you are trying to run a parallel job" \ - "but you are putting the output into just one log file (" + opts.logfile + ")"; + print("lonestar.py: you are trying to run a parallel job" \ + "but you are putting the output into just one log file (" + opts.logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(opts.logfile): @@ -261,8 +262,8 @@ def setup_paths_and_vars(opts): taskname=os.path.basename(queue_logfile) taskname = taskname.replace(".log", ""); if taskname == "": - print >> sys.stderr, "lonestar.py: you specified the log file name in such form " \ - "that leads to an empty task name ("+logfile + ")"; + print("lonestar.py: you specified the log file name in such form " \ + "that leads to an empty task name ("+logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(queue_logfile): diff --git a/egs/babel/s5b/local/resegment/segmentation.py b/egs/babel/s5b/local/resegment/segmentation.py index 7c5c8665a16..aed65a4ca14 100755 --- a/egs/babel/s5b/local/resegment/segmentation.py +++ b/egs/babel/s5b/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return (float(sum(l))/len(l)) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. -class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,8 +275,8 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) - duration = int(float(splits[4])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) + duration = int((float(splits[4])/frame_shift) + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) if type1 == "NON-LEX": @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. 
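# --- Editorial note (illustration, not part of the patch above) -------------
# "from __future__ import division" (added at the top of this file) makes "/"
# true division under Python 2 as well, so the seconds-to-frames conversions
# below give the same result under either interpreter, and integer operands
# no longer truncate silently. A small sketch with made-up values (assumes
# Python 3 or the __future__ import, as in this file):
frame_shift = 0.01                              # 10 ms frames
max_frames = int(30.0 / frame_shift)            # 30 s -> 3000 frames
start_frame = int(12.34 / frame_shift + 0.5)    # 12.34 s -> nearest frame
assert max_frames == 3000 and start_frame == 1234
assert sum([1, 2, 2]) / len([1, 2, 2]) == 5.0 / 3.0   # true division, not 1
# ---------------------------------------------------------------------------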
-class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -1290,22 +1291,22 @@ def main(): dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") - parser.add_argument('--first-separator', type=str, \ + parser.add_argument('--first-separator', \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") - parser.add_argument('--second-separator', type=str, \ + parser.add_argument('--second-separator', \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") - parser.add_argument('--remove-noise-only-segments', type=str, \ + parser.add_argument('--remove-noise-only-segments', \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. (default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); - parser.add_argument('--channel1-file', type=str, \ + parser.add_argument('--channel1-file', \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") - parser.add_argument('--channel2-file', type=str, \ + parser.add_argument('--channel2-file', \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > int(options.max_length_diff/options.frame_shift): sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. 
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5c/local/lonestar.py b/egs/babel/s5c/local/lonestar.py index e1594e55ada..809f99b22cf 100755 --- a/egs/babel/s5c/local/lonestar.py +++ b/egs/babel/s5c/local/lonestar.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function from pylauncher import * import pylauncher import sys @@ -39,7 +40,7 @@ def KaldiLauncher(lo, **kwargs): logfiles = list() commands = list() - for q in xrange(lo.jobstart, lo.jobend+1): + for q in range(lo.jobstart, lo.jobend+1): s = "bash " + lo.queue_scriptfile + " " + str(q) commands.append(s) @@ -74,7 +75,7 @@ def KaldiLauncher(lo, **kwargs): time.sleep(delay); lines=tail(10, logfile) - with_status=filter(lambda x:re.search(r'with status (\d+)', x), lines) + with_status=[x for x in lines if re.search(r'with status (\d+)', x)] if len(with_status) == 0: sys.stderr.write("The last line(s) of the log-file " + logfile + " does not seem" @@ -98,7 +99,7 @@ def KaldiLauncher(lo, **kwargs): sys.exit(-1); #Remove service files. Be careful not to remove something that might be needed in problem diagnostics - for i in xrange(len(commands)): + for i in range(len(commands)): out_file=os.path.join(qdir, ce.outstring+str(i)) #First, let's wait on files missing (it might be that those are missing @@ -149,7 +150,7 @@ def KaldiLauncher(lo, **kwargs): #print job.final_report() -class LauncherOpts: +class LauncherOpts(object): def __init__(self): self.sync=0 self.nof_threads = 1 @@ -199,7 +200,7 @@ def CmdLineParser(argv): jobend=int(m.group(2)) argv.pop(0) elif re.match("^.+=.*:.*$", argv[0]): - print >> sys.stderr, "warning: suspicious JOB argument " + argv[0]; + print("warning: suspicious JOB argument " + argv[0], file=sys.stderr); if jobstart > jobend: sys.stderr.write("lonestar.py: JOBSTART("+ str(jobstart) + ") must be lower than JOBEND(" + str(jobend) + ")\n") @@ -238,8 +239,8 @@ def setup_paths_and_vars(opts): cwd = os.getcwd() if opts.varname and (opts.varname not in opts.logfile ) and (opts.jobstart != opts.jobend): - print >>sys.stderr, "lonestar.py: you are trying to run a parallel job" \ - "but you are putting the output into just one log file (" + opts.logfile + ")"; + print("lonestar.py: you are trying to run a parallel job" \ + "but you are putting the output into just one log file (" + opts.logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(opts.logfile): @@ -261,8 +262,8 @@ def setup_paths_and_vars(opts): taskname=os.path.basename(queue_logfile) taskname = taskname.replace(".log", ""); if taskname == "": - print >> sys.stderr, "lonestar.py: you specified the log file name in such form " \ - "that leads to an empty task name ("+logfile + ")"; + print("lonestar.py: you specified the log file name in such form " \ + "that leads to an empty task name ("+logfile + ")", file=sys.stderr); sys.exit(1) if not os.path.isabs(queue_logfile): diff --git a/egs/babel/s5c/local/resegment/segmentation.py b/egs/babel/s5c/local/resegment/segmentation.py index 7c5c8665a16..4bdb0fea75c 100755 --- a/egs/babel/s5c/local/resegment/segmentation.py +++ b/egs/babel/s5c/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return (float(sum(l))/len(l)) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. 
-class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,7 +275,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) duration = int(float(splits[4])/frame_shift + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. -class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -351,9 +352,9 @@ def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): self.frame_shift = options.frame_shift # Convert length in seconds to frames - self.max_frames = int(options.max_segment_length / options.frame_shift) - self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift) - self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift) + self.max_frames = int(options.max_segment_length/options.frame_shift) + self.hard_max_frames = int(options.hard_max_segment_length/options.frame_shift) + self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length/options.frame_shift) if ( options.remove_noise_only_segments == "false" ): self.remove_noise_segments = False elif ( options.remove_noise_only_segments == "true" ): @@ -540,7 +541,7 @@ def set_nonspeech_proportion(self): # Set the number of non-speech frames to be added depending on the # silence proportion. The target number of frames in the segments # is computed as below: - target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion)) + target_segment_frames = int(num_speech_frames/(1.0 - self.options.silence_proportion)) # The number of frames currently in the segments num_segment_frames = num_speech_frames @@ -599,7 +600,7 @@ def set_nonspeech_proportion(self): if not changed: # avoid an infinite loop. if no changes, then break.
break if num_segment_frames < target_segment_frames: - proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames + proportion = float(num_segment_frames - num_speech_frames)/num_segment_frames sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion)) ########################################################################### @@ -863,14 +864,14 @@ def split_long_segments(self): # Count the number of times long segments are split self.stats.split_segments += 1 - num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999) + num_pieces = int((float(segment_length)/self.hard_max_frames) + 0.99999) sys.stderr.write("%s: Warning: for recording %s, " \ % (sys.argv[0], self.file_id) \ + "splitting segment of length %f seconds into %d pieces " \ % (segment_length * self.frame_shift, num_pieces) \ + "(--hard-max-segment-length %f)\n" \ % self.options.hard_max_segment_length) - frames_per_piece = int(segment_length / num_pieces) + frames_per_piece = int(segment_length/num_pieces) for i in range(1,num_pieces): q = n + i * frames_per_piece self.S[q] = True @@ -1290,22 +1291,22 @@ def main(): dest='hard_max_segment_length', default=15.0, \ help="Hard maximum on the segment length above which the segment " \ + "will be broken even if in the middle of speech (default: %(default)s)") - parser.add_argument('--first-separator', type=str, \ + parser.add_argument('--first-separator', \ dest='first_separator', default="-", \ help="Separator between recording-id and start-time (default: %(default)s)") - parser.add_argument('--second-separator', type=str, \ + parser.add_argument('--second-separator', \ dest='second_separator', default="-", \ help="Separator between start-time and end-time (default: %(default)s)") - parser.add_argument('--remove-noise-only-segments', type=str, \ + parser.add_argument('--remove-noise-only-segments', \ dest='remove_noise_only_segments', default="true", choices=("true", "false"), \ help="Remove segments that have only noise. (default: %(default)s)") parser.add_argument('--min-inter-utt-silence-length', type=float, \ dest='min_inter_utt_silence_length', default=1.0, \ help="Minimum silence that must exist between two separate utterances (default: %(default)s)"); - parser.add_argument('--channel1-file', type=str, \ + parser.add_argument('--channel1-file', \ dest='channel1_file', default="inLine", \ help="String that matches with the channel 1 file (default: %(default)s)") - parser.add_argument('--channel2-file', type=str, \ + parser.add_argument('--channel2-file', \ dest='channel2_file', default="outLine", \ help="String that matches with the channel 2 file (default: %(default)s)") parser.add_argument('--isolated-resegmentation', \ @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > int(options.max_length_diff/options.frame_shift): sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. 
" \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh index 2d1fcb2259e..4a0810b9415 100755 --- a/egs/babel/s5c/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5c/local/syllab/generate_syllable_lang.sh @@ -118,8 +118,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf index a6b22de419f..9cd043716ce 100644 --- a/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf +++ b/egs/babel/s5d/conf/lang/404-georgian.FLP.official.conf @@ -75,8 +75,8 @@ unsup_data_list=./conf/lists/404-georgian/untranscribed-training.list unsup_nj=32 -lexicon_file= -lexiconFlags="--romanized --oov " +lexicon_file=/export/corpora/LDC/LDC2016S12/IARPA_BABEL_OP3_404/conversational/reference_materials/lexicon.txt +lexiconFlags=" --romanized --oov " diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh index 4f485edf7da..7b4535f8c5e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh index 72f7a3c32dd..5fc14dda826 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh index be0c2cc4b9b..8c7de5d18d4 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh @@ -127,7 +127,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh index 8f21a239794..0b3e70b5a04 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh @@ -127,7 +127,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk 
'{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh index 7898d172242..45f2907645e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh index 49462573245..0d92aff5c28 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh index c888d985f5e..4129c00dcb4 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh index e9a045e113a..1cfa50c1aa1 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh @@ -128,7 +128,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh index ce192a91665..ba8ac1e0373 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 
dropout-proportion=0.0" label_delay=5 diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh index 3fc0ef2206c..5de285e080e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh @@ -129,7 +129,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0 " label_delay=5 diff --git a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py index 68280762597..91419f6e920 100755 --- a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py +++ b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py @@ -106,6 +106,7 @@ # Import Statements from __future__ import print_function +from __future__ import division import codecs import argparse import unicodedata @@ -340,7 +341,7 @@ def encode(unicode_transcription, tag_percentage, log=False): int2graph = {v: k for k, v in graph2int.items()} graph_list_int = [graph2int[g] for g in graph_list] bin_edges = range(0, len(int2graph.keys()) + 1) - graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0] / float(len(graph_list_int)) + graph_counts = np.histogram(graph_list_int, bins=bin_edges)[0]/float(len(graph_list_int)) # Set count threshold to frequency that tags the bottom 10% of graphemes bottom_idx = int(np.floor(tag_percentage * len(graph_counts))) count_thresh = sorted(graph_counts)[bottom_idx] @@ -465,7 +466,7 @@ def encode(unicode_transcription, tag_percentage, log=False): for g_dict in table: g_map = "" map_number = 0 - for g_field, g_val in sorted(g_dict.iteritems()): + for g_field, g_val in sorted(g_dict.items()): if(g_field == ("MAP" + str(map_number))): g_map = g_map + g_val + " " map_number = map_number + 1 @@ -561,7 +562,7 @@ def write_table(table, outfile): # Start writing to output with codecs.open(outfile, "w", "utf-8") as fo: # Get header names - header_names = sorted(set().union(*[d.keys() for d in table])) + header_names = sorted(set().union(*[list(d.keys()) for d in table])) # Write headers for h in header_names[:-1]: fo.write("%s\t" % h) @@ -595,7 +596,7 @@ def write_map(grapheme_map, mapfile): ''' with codecs.open(mapfile, 'w', encoding='utf-8') as f: - for g, g_map in grapheme_map.iteritems(): + for g, g_map in grapheme_map.items(): print(g, g_map, file=f) @@ -613,14 +614,14 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, with codecs.open(outfile, "w", "utf-8") as f: # First write the non-speech words try: - for w in sil_lex.iterkeys(): + for w in sil_lex.keys(): f.write("%s\t%s\n" % (w, sil_lex[w])) except AttributeError: pass # Then write extra-speech words try: - for w in extra_lex.iterkeys(): + for w in extra_lex.keys(): f.write("%s\t%s\n" % (w, extra_lex[w])) except AttributeError: pass @@ -629,9 +630,9 @@ def write_lexicon(baseforms, encoded_transcription, outfile, sil_lex=None, for idx, w in enumerate(baseforms): # This is really just for BABEL in case is written as a word if(w[0].lower() == ""): - f.write("%s\t\n" % (unicode(w[0]))) + f.write("%s\t\n" % (w[0])) else: - f.write("%s\t%s\n" % (unicode(w[0]), + f.write("%s\t%s\n" % (w[0], encoded_transcription[idx])) if __name__ == "__main__": diff --git 
a/egs/babel/s5d/local/lexicon/make_word_list.py b/egs/babel/s5d/local/lexicon/make_word_list.py index 9a9e17f6c60..c1473b8ced8 100755 --- a/egs/babel/s5d/local/lexicon/make_word_list.py +++ b/egs/babel/s5d/local/lexicon/make_word_list.py @@ -85,7 +85,7 @@ def main(): # Print the word list with codecs.open(args.word_list, "w", encoding="utf-8") as f: for word, count in words: - f.write("%d %s\n" % (count, unicode(word))) + f.write("%d %s\n" % (count, word)) if args.misprons is not None: with codecs.open(args.misprons, "w", encoding="utf-8") as f: diff --git a/egs/babel/s5d/local/make_L_align.sh b/egs/babel/s5d/local/make_L_align.sh index 50e46a00493..41e9ff32958 100755 --- a/egs/babel/s5d/local/make_L_align.sh +++ b/egs/babel/s5d/local/make_L_align.sh @@ -34,18 +34,24 @@ tmpdir=$1 dir=$2 outdir=$3 +for f in $dir/phones/optional_silence.txt $dir/phones.txt $dir/words.txt ; do + [ ! -f $f ] && echo "$0: The file $f must exist!" exit 1 +fi + silphone=`cat $dir/phones/optional_silence.txt` || exit 1; +if [ ! -f $tmpdir/lexicon.txt ] && [ ! -f $tmpdir/lexiconp.txt ] ; then + echo "$0: At least one of the files $tmpdir/lexicon.txt or $tmpdir/lexiconp.txt must exist" >&2 + exit 1 +fi + # Create lexicon with alignment info if [ -f $tmpdir/lexicon.txt ] ; then cat $tmpdir/lexicon.txt | \ awk '{printf("%s #1 ", $1); for (n=2; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -elif [ -f $tmpdir/lexiconp.txt ] ; then +else cat $tmpdir/lexiconp.txt | \ awk '{printf("%s #1 ", $1); for (n=3; n <= NF; n++) { printf("%s ", $n); } print "#2"; }' -else - echo "Neither $tmpdir/lexicon.txt nor $tmpdir/lexiconp.txt does not exist" - exit 1 fi | utils/make_lexicon_fst.pl - 0.5 $silphone | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ diff --git a/egs/babel/s5d/local/prepare_unicode_lexicon.py b/egs/babel/s5d/local/prepare_unicode_lexicon.py index 86fa4d60ba1..3b9dc1abd86 100755 --- a/egs/babel/s5d/local/prepare_unicode_lexicon.py +++ b/egs/babel/s5d/local/prepare_unicode_lexicon.py @@ -89,7 +89,7 @@ def extract_phonemes(lexicon): # Read all baseform units into dictionary with {a: [a, a_1, a_2], # b: [b_1, b_3], ...} phonemes_dict = {} - for word, pron in lexicon.iteritems(): + for word, pron in lexicon.items(): for p in pron.split(): try: base = p.split("_",1)[0] @@ -98,11 +98,11 @@ def extract_phonemes(lexicon): phonemes_dict[base] = [p] # Makes sure there are no repeats in the list - phonemes_dict = {k: set(v) for k, v in phonemes_dict.iteritems()} + phonemes_dict = {k: set(v) for k, v in phonemes_dict.items()} # Get all unique phonemes phonemes = [] - for v in phonemes_dict.itervalues(): + for v in phonemes_dict.values(): for p in v: phonemes.append(p) @@ -137,11 +137,11 @@ def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict, # Write all possible phone_tag combinations that occur in the lexicon for tag in tags: - for p in nonsil_phonemes_dict.iterkeys(): + for p in nonsil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in nonsil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) - for p in sil_phonemes_dict.iterkeys(): + for p in sil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in sil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) diff --git a/egs/babel/s5d/local/resegment/segmentation.py b/egs/babel/s5d/local/resegment/segmentation.py index 7c5c8665a16..02fd7646b96 100755 --- a/egs/babel/s5d/local/resegment/segmentation.py +++ 
b/egs/babel/s5d/local/resegment/segmentation.py @@ -3,6 +3,7 @@ # Copyright 2014 Vimal Manohar # Apache 2.0 +from __future__ import division import os, glob, argparse, sys, re, time from argparse import ArgumentParser @@ -19,12 +20,12 @@ def mean(l): if len(l) > 0: - return float(sum(l)) / len(l) + return float(sum(l))/len(l) return 0 # Analysis class # Stores statistics like the confusion matrix, length of the segments etc. -class Analysis: +class Analysis(object): def __init__(self, file_id, frame_shift, prefix): self.confusion_matrix = [0] * 9 self.type_counts = [ [[] for j in range(0,9)] for i in range(0,3) ] @@ -274,8 +275,8 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): i = len(this_file) category = splits[6] word = splits[5] - start_time = int(float(splits[3])/frame_shift + 0.5) - duration = int(float(splits[4])/frame_shift + 0.5) + start_time = int((float(splits[3])/frame_shift) + 0.5) + duration = int((float(splits[4])/frame_shift) + 0.5) if i < start_time: this_file.extend(["0"]*(start_time - i)) if type1 == "NON-LEX": @@ -295,7 +296,7 @@ def read_rttm_file(rttm_file, temp_dir, frame_shift): # Stats class to store some basic stats about the number of # times the post-processor goes through particular loops or blocks # of code in the algorithm. This is just for debugging. -class Stats: +class Stats(object): def __init__(self): self.inter_utt_nonspeech = 0 self.merge_nonspeech_segment = 0 @@ -321,7 +322,7 @@ def reset(self): self.noise_only = 0 # Timer class to time functions -class Timer: +class Timer(object): def __enter__(self): self.start = time.clock() return self @@ -332,7 +333,7 @@ def __exit__(self, *args): # The main class for post-processing a file. # This does the segmentation either looking at the file isolated # or by looking at both classes simultaneously -class JointResegmenter: +class JointResegmenter(object): def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): # Pointers to prediction arrays and Initialization @@ -351,8 +352,8 @@ def __init__(self, P, A, f, options, phone_map, stats = None, reference = None): self.frame_shift = options.frame_shift # Convert length in seconds to frames - self.max_frames = int(options.max_segment_length / options.frame_shift) - self.hard_max_frames = int(options.hard_max_segment_length / options.frame_shift) + self.max_frames = int(options.max_segment_length/options.frame_shift) + self.hard_max_frames = int(options.hard_max_segment_length/options.frame_shift) self.min_inter_utt_nonspeech_length = int(options.min_inter_utt_silence_length / options.frame_shift) if ( options.remove_noise_only_segments == "false" ): self.remove_noise_segments = False @@ -540,7 +541,7 @@ def set_nonspeech_proportion(self): # Set the number of non-speech frames to be added depending on the # silence proportion. The target number of frames in the segments # is computed as below: - target_segment_frames = int(num_speech_frames / (1.0 - self.options.silence_proportion)) + target_segment_frames = int(num_speech_frames/(1.0 - self.options.silence_proportion)) # The number of frames currently in the segments num_segment_frames = num_speech_frames @@ -599,7 +600,7 @@ def set_nonspeech_proportion(self): if not changed: # avoid an infinite loop. if no changes, then break. 
break if num_segment_frames < target_segment_frames: - proportion = float(num_segment_frames - num_speech_frames) / num_segment_frames + proportion = float(num_segment_frames - num_speech_frames)/ num_segment_frames sys.stderr.write("%s: Warning: for recording %s, only got a proportion %f of non-speech frames, versus target %f\n" % (sys.argv[0], self.file_id, proportion, self.options.silence_proportion)) ########################################################################### @@ -863,14 +864,14 @@ def split_long_segments(self): # Count the number of times long segments are split self.stats.split_segments += 1 - num_pieces = int((float(segment_length) / self.hard_max_frames) + 0.99999) + num_pieces = int((float(segment_length)/self.hard_max_frames) + 0.99999) sys.stderr.write("%s: Warning: for recording %s, " \ % (sys.argv[0], self.file_id) \ + "splitting segment of length %f seconds into %d pieces " \ % (segment_length * self.frame_shift, num_pieces) \ + "(--hard-max-segment-length %f)\n" \ % self.options.hard_max_segment_length) - frames_per_piece = int(segment_length / num_pieces) + frames_per_piece = int(segment_length/num_pieces) for i in range(1,num_pieces): q = n + i * frames_per_piece self.S[q] = True @@ -1388,7 +1389,7 @@ def main(): speech_cap = None if options.speech_cap_length != None: - speech_cap = int( options.speech_cap_length / options.frame_shift ) + speech_cap = int(options.speech_cap_length/options.frame_shift) # End if for f in pred_files: @@ -1454,7 +1455,7 @@ def main(): f2 = f3 # End if - if (len(A1) - len(A2)) > options.max_length_diff / options.frame_shift: + if (len(A1) - len(A2)) > options.max_length_diff/options.frame_shift: sys.stderr.write( \ "%s: Warning: Lengths of %s and %s differ by more than %f. " \ % (sys.argv[0], f1,f2, options.max_length_diff) \ diff --git a/egs/babel/s5d/local/syllab/generate_phone_lang.sh b/egs/babel/s5d/local/syllab/generate_phone_lang.sh index fc21a23231b..81d8a0acdc7 100755 --- a/egs/babel/s5d/local/syllab/generate_phone_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_phone_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh index db7b0902425..a7bd667027c 100755 --- a/egs/babel/s5d/local/syllab/generate_syllable_lang.sh +++ b/egs/babel/s5d/local/syllab/generate_syllable_lang.sh @@ -122,8 +122,7 @@ ln -s lex.syllabs2phones.disambig.fst $out/L_disambig.fst echo "Validating the output lang dir" utils/validate_lang.pl $out || exit 1 -sed -i'' 's/#1$//g' $lout/lexicon.txt -sed -i'' 's/#1$//g' $lout/lexiconp.txt +perl -i -pe 's/#1$//g' $lout/lexicon.txt $lout/lexiconp.txt echo "Done OK." exit 0 diff --git a/egs/bentham/README.txt b/egs/bentham/README.txt new file mode 100644 index 00000000000..02870c265f6 --- /dev/null +++ b/egs/bentham/README.txt @@ -0,0 +1,5 @@ +This directory contains example scripts for handwriting recognition on +the Bentham dataset: +http://www.transcriptorium.eu/~htrcontest/contestICFHR2014/public_html/ +In the ICFHR 2014 contest, the best performing system in the unrestricted +track obtained a WER of 8.6%. 
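The babel tuning-script and segmentation.py hunks above are all part of the same Python 2-to-3 cleanup: print becomes a function, / becomes true division, and the iteritems/iterkeys/itervalues dict methods disappear. A minimal, hedged sketch of those patterns, runnable under either interpreter (the numbers are illustrative, echoing the xent_regularize=0.1 and frame arithmetic seen in the hunks):

from __future__ import print_function, division   # both imports are no-ops on Python 3

# The recipes compute the xent learning-rate factor as 0.5 / xent_regularize.
# Wrapping the expression in parentheses makes it a function call on Python 3
# and harmless grouping on Python 2, so the same one-liner works under both.
xent_regularize = 0.1                  # illustrative value matching the scripts above
print(0.5 / xent_regularize)           # -> 5.0 under Python 2 and Python 3

# Dict iteration: iteritems()/iterkeys()/itervalues() exist only on Python 2;
# items()/keys()/values() exist on both, hence the lexicon-script changes.
grapheme_map = {"a": "a a_1", "b": "b_1"}
for g, g_map in sorted(grapheme_map.items()):
    print(g, g_map)

# Frame arithmetic in the style of segmentation.py: the explicit float() keeps
# the round-up behaviour identical whether or not division would truncate.
segment_length, hard_max_frames = 3005, 1000       # illustrative frame counts
num_pieces = int((float(segment_length) / hard_max_frames) + 0.99999)   # -> 4
frames_per_piece = int(segment_length / num_pieces)                     # -> 751
print(num_pieces, frames_per_piece)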
diff --git a/egs/bentham/v1/cmd.sh b/egs/bentham/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/bentham/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/bentham/v1/image b/egs/bentham/v1/image new file mode 120000 index 00000000000..6a4b3afeb09 --- /dev/null +++ b/egs/bentham/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/compare_wer.sh b/egs/bentham/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..2ce14e13694 --- /dev/null +++ b/egs/bentham/v1/local/chain/compare_wer.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. ./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/bentham/v1/local/chain/run_cnn_e2eali.sh b/egs/bentham/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..e2545b0186e --- /dev/null +++ b/egs/bentham/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1a.sh \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/run_e2e_cnn.sh b/egs/bentham/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/bentham/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..ec530ef1ce4 --- /dev/null +++ b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_e2eali_1a +# WER 13.72 8.14 +# WER (rescored) 13.40 8.00 +# CER 6.56 2.82 +# CER (rescored) 6.33 2.73 +# WER val 13.51 8.19 +# WER (rescored) val 13.38 7.97 +# CER val 6.40 2.93 +# CER (rescored) val 6.29 2.90 +# Final train prob 0.1037 -0.0613 +# Final valid prob 0.0720 -0.0988 +# Final train prob (xent) -0.3706 +# Final valid prob (xent) -0.4669 +# Parameters 11.54M 4.29M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=20 nj=3..5 num-params=4.3M dim=40->336 combine=-0.066->-0.066 (over 1) xent:train/valid[12,19,final]=(-0.822,-0.437,-0.371/-0.859,-0.514,-0.467) logprob:train/valid[12,19,final]=(-0.188,-0.078,-0.061/-0.204,-0.114,-0.099) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
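A hedged sketch of the compatibility the comment above alludes to: before handing a different lang directory to mkgraph.sh, one could compare its phone symbol table against the one the model was trained with. The alternative directory path below is hypothetical and the recipe itself never runs this check.

#!/usr/bin/env python3
# Sketch only: "data/lang_other" is a hypothetical swap-in lang directory.
import sys

def read_symbol_table(path):
    # Kaldi symbol tables (phones.txt, words.txt) are "symbol integer-id" per line.
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            sym, idx = line.split()
            table[sym] = int(idx)
    return table

ref = read_symbol_table("data/lang/phones.txt")
alt = read_symbol_table("data/lang_other/phones.txt")

if ref != alt:
    sys.exit("phones.txt mismatch: this lang dir cannot be given to mkgraph.sh here")
print("phone symbol tables match (%d symbols)" % len(ref))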
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..716bdce3729 --- /dev/null +++ b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1b +# WER 13.72 +# WER (rescored) 13.40 +# CER 6.56 +# CER (rescored) 6.33 +# WER val 13.51 +# WER (rescored) val 13.38 +# CER val 6.40 +# CER (rescored) val 6.29 +# Final train prob 0.1037 +# Final valid prob 0.0720 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 11.54M +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=26 nj=2..4 num-params=11.5M dim=40->17112 combine=0.054->0.054 (over 1) logprob:train/valid[16,25,final]=(0.078,0.102,0.104/0.051,0.069,0.072) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
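Back-of-the-envelope bookkeeping for the e2e xconfig above, as a sketch only, assuming the usual nnet3 convention that a convolutional layer's output dimension is height-out times num-filters-out:

layers = [
    # (name, height_in, height_out, num_filters_out), taken from the xconfig above
    ("cnn1", 40, 40, 36),
    ("cnn2", 40, 20, 36),   # height-subsample-out=2 halves the 40 input rows
    ("cnn3", 20, 20, 70),
    ("cnn4", 20, 10, 70),   # second height subsampling: 20 -> 10 rows
]
for name, h_in, h_out, nfilt in layers:
    print("%s: height %d -> %d, %d filters, output dim %d"
          % (name, h_in, h_out, nfilt, h_out * nfilt))
# cnn4 hands 10 * 70 = 700 dimensions to tdnn1, whose Append(-4,-2,0,2,4)
# splicing sees 5 * 700 = 3500 inputs before the 450-dim TDNN layer.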
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/bentham/v1/local/check_tools.sh b/egs/bentham/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/bentham/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/bentham/v1/local/create_splits.sh b/egs/bentham/v1/local/create_splits.sh new file mode 100755 index 00000000000..e8ea2279a49 --- /dev/null +++ b/egs/bentham/v1/local/create_splits.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2018 Desh Raj (Johns Hopkins University) + +# This script reads the extracted Bentham database files and creates +# the following files (for all the data subsets): +# text, utt2spk, images.scp. 
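The three per-split files the comment above lists all follow the same one-record-per-line Kaldi convention. A small hedged sketch of that layout (the utterance IDs, speaker IDs, paths, and transcripts below are invented; only the file formats mirror what the recipe expects):

records = [
    # (utterance id, speaker id, line-image path, transcript) -- all made up
    ("071_080_001_01_01", "071_080_001", "lines/071_080_001_01_01.png", "the penal code"),
    ("071_080_001_01_02", "071_080_001", "lines/071_080_001_01_02.png", "of the republic"),
]

with open("text", "w", encoding="utf-8") as text, \
     open("utt2spk", "w", encoding="utf-8") as utt2spk, \
     open("images.scp", "w", encoding="utf-8") as scp:
    for utt, spk, img, trans in records:
        text.write("%s %s\n" % (utt, trans))       # utt-id followed by the transcript
        utt2spk.write("%s %s\n" % (utt, spk))      # utt-id followed by its speaker id
        scp.write("%s %s\n" % (utt, img))          # utt-id followed by the image path
# spk2utt is then derived from utt2spk with utils/utt2spk_to_spk2utt.pl,
# as the script does for each split.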
+ +download_dir=$1 +save_dir=$2 +mkdir -p $save_dir/{train,val,test} +touch $save_dir/{train,val,test}/{text,images.scp,utt2spk,spk2utt} + +partition_dir=$download_dir"/gt/Partitions/" +lines_dir=$download_dir"/gt/Images/Lines/" +text_dir=$download_dir"/gt/Transcriptions/" + +function split { + echo "Creating $1 split" + split_dir=$save_dir/$1 + line_file=$partition_dir/$2 + + while read -r line; do + name="$line" + spkid=${name:0:11} + echo -n $name" " | cat - $text_dir/$name* >> $split_dir/text + echo >> $split_dir/text + echo $name $lines_dir"/"$name".png" >> $split_dir/images.scp + echo $name $spkid >> $split_dir/utt2spk + done < "$line_file" + + perl -i -ne 'print if /\S/' $split_dir/images.scp $split_dir/text $split_dir/utt2spk + utils/utt2spk_to_spk2utt.pl $split_dir/utt2spk > $split_dir/spk2utt +} + +split train TrainLines.lst +split val ValidationLines.lst +split test TestLines.lst diff --git a/egs/bentham/v1/local/download_bentham_text.sh b/egs/bentham/v1/local/download_bentham_text.sh new file mode 100755 index 00000000000..e09403718a1 --- /dev/null +++ b/egs/bentham/v1/local/download_bentham_text.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2018 Desh Raj +# Apache 2.0 + +## Download all written works of Jeremy Bentham for the Bentham HWR task LM training + +baseurl='http://oll.libertyfund.org/titles/' +savedir=$1 + +mkdir -p $savedir + +declare -a texts=("bentham-the-works-of-jeremy-bentham-vol-1/simple" + "bentham-the-works-of-jeremy-bentham-vol-2/simple" + "bentham-the-works-of-jeremy-bentham-vol-3/simple" + "bentham-the-works-of-jeremy-bentham-vol-5-scotch-reform-real-property-codification-petitions/simple" + "bentham-the-works-of-jeremy-bentham-vol-6/simple" + "bentham-the-works-of-jeremy-bentham-vol-7-rationale-of-judicial-evidence-part-2/simple" + "bentham-the-works-of-jeremy-bentham-vol-8/simple" + "bentham-the-works-of-jeremy-bentham-vol-9-constitutional-code" + "bentham-the-works-of-jeremy-bentham-vol-10-memoirs-part-i-and-correspondence/simple" + "bentham-the-works-of-jeremy-bentham-vol-11-memoirs-of-bentham-part-ii-and-analytical-index") + +counter=1 +for i in "${texts[@]}" +do + echo "Downloading $baseurl$i" + curl -s -N {$baseurl}{$i} | sed -e 's/<[^>]*>//g' > $savedir"/bentham"$counter".txt" + ((counter++)) +done + +cat $savedir"/*.txt" > $savedir"/complete.txt" +rm $savedir"/bentham*.txt" diff --git a/egs/bentham/v1/local/extract_features.sh b/egs/bentham/v1/local/extract_features.sh new file mode 100755 index 00000000000..460e467e99c --- /dev/null +++ b/egs/bentham/v1/local/extract_features.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --num-channels 4 \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/bentham/v1/local/gen_topo.py b/egs/bentham/v1/local/gen_topo.py new file mode 100755 index 00000000000..af9e20317d8 --- /dev/null +++ b/egs/bentham/v1/local/gen_topo.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +from __future__ import division +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_nonsil_states)) +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_punctuation_states)) +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0/(args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = "{} {} {} ".format(state_str, x, transp) + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " {0} {0} ".format(x) + for y in range(1, args.num_sil_states): + state_str = "{} {} {} ".format(state_str, y, transp) + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(second_last, args.num_sil_states)) + print(" {} ".format(args.num_sil_states)) +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" {} ".format(args.num_sil_states)) +print("") +print("") diff --git a/egs/bentham/v1/local/prepare_data.sh b/egs/bentham/v1/local/prepare_data.sh new file mode 100755 index 00000000000..bbcc9863611 --- /dev/null +++ b/egs/bentham/v1/local/prepare_data.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Copyright 2018 Desh Raj (Johns Hopkins University) + +# Apache 2.0 + +# This script downloads the Bentham handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling create_splits.sh. + +# In addition, it downloads data for all texts of Bentham for LM training purpose. + +stage=0 +download_dir=data/local/download/ +database_dir="" +text_corpus_dir="" + +mkdir -p $download_dir + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +BENTHAM_IMAGES_URL='http://transcriptorium.eu/~tsdata/BenthamR0/BenthamDatasetR0-Images.zip' +BENTHAM_GT_URL='http://transcriptorium.eu/~tsdata/BenthamR0/BenthamDatasetR0-GT.zip' +bentham_images=$database_dir"/images.zip" +bentham_gt=$database_dir"/gt.zip" +bentham_text=$download_dir"/text" + +# download and extract images and transcriptions +if [ ! -f $bentham_images ]; then + echo "Downloading images and transcriptions to $database_dir" + mkdir -p $database_dir + wget $BENTHAM_IMAGES_URL -O $bentham_images + wget $BENTHAM_GT_URL -O $bentham_gt +else + echo "Not downloading since corpus already exists" +fi + +if [ ! 
-d $download_dir/"gt" ]; then + unzip $bentham_gt -d $download_dir + mv $download_dir"/BenthamDatasetR0-GT" $download_dir"/gt" +else + echo "Local extracted corpus already exists" +fi + +# Download extra Bentham text for LM training +if [ -d $text_corpus_dir ]; then + echo "$0: Not downloading Bentham text corpus as it is already there." +else + local/download_bentham_text.sh $text_corpus_dir +fi + +# Copy extra Bentham text to local +if [ -d $bentham_text ]; then + echo "$0: Not copying as local Bentham already present." +else + mkdir -p $bentham_text + cp $text_corpus_dir/Bentham-Text/* $bentham_text + echo "$0: Done copying extra Bentham text to local." +fi + +# Creating train, val, and test splits for all directories +if [ -d data/train ]; then + echo "Data splits and files already exist. Not creating again." +else + echo "Creating train, val, and test splits and corresponding files.." + local/create_splits.sh $download_dir "data/" +fi + diff --git a/egs/bentham/v1/local/prepare_dict.sh b/egs/bentham/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/bentham/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/bentham/v1/local/prepare_lexicon.py b/egs/bentham/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..3de96056c2a --- /dev/null +++ b/egs/bentham/v1/local/prepare_lexicon.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
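A hedged worked example of the mapping the comment above describes; the BPE tokens are made up, and the real lexicon is built from the tokens that actually occur in data/train/text:

# Illustrative only: three invented BPE tokens and the lexicon entries they yield.
# Each token is spelled out character by character, with the word-boundary
# marker '|' rewritten as the SIL phone.
for token in ["|the", "|pen", "al"]:
    spelled = " ".join("SIL" if ch == "|" else ch for ch in token)
    print("%s %s" % (token, spelled))
# -> |the SIL t h e
#    |pen SIL p e n
#    al a l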
+ +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/bentham/v1/local/score.sh b/egs/bentham/v1/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/bentham/v1/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/bentham/v1/local/train_lm.sh b/egs/bentham/v1/local/train_lm.sh new file mode 100755 index 00000000000..48632a90769 --- /dev/null +++ b/egs/bentham/v1/local/train_lm.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# 2018 Desh Raj +# Apache 2.0 +# +# This script trains an LM on the Bentham text corpus and training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +vocab_size=50000 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data +bentham_text_dir=data/local/download/text/ + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using Bentham text with last 5000 lines for dev + + cat $bentham_text_dir/complete.txt | \ + sed '/^\s*$/d' | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/bentham.txt + tail -n +5000 ${dir}/bentham.txt > ${dir}/data/text/bentham.txt + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -5000 ${dir}/bentham.txt > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/hwr.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/val/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from Bentham text + cat ${dir}/data/text/{bentham,hwr}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='bentham=1 hwr=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. 
+ size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/bentham/v1/local/wer_output_filter b/egs/bentham/v1/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/bentham/v1/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/bentham/v1/path.sh b/egs/bentham/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/bentham/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/bentham/v1/run_end2end.sh b/egs/bentham/v1/run_end2end.sh new file mode 100755 index 00000000000..63c034e41f6 --- /dev/null +++ b/egs/bentham/v1/run_end2end.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Copyright 2018 Ashish Arora (Johns Hopkins University) +# 2018 Desh Raj (Johns Hopkins University) + +set -e +stage=0 +nj=20 +# bentham_hwr_database points to the official database path on the JHU grid. If you have not +# already downloaded the data, you will have to first download it and then name the Images +# and Ground Truth zipped files as images.zip and gt.zip. Then, point the path below to the +# location where your zipped files are present on the grid. +bentham_hwr_database=/export/corpora5/handwriting_ocr/hwr1/ICDAR-HTR-Competition-2015 +# bentham_text_database points to the database path on the JHU grid. +# It contains all of the written works of Bentham, and can be used to train +# an LM for the HWR task. We have provided a script which downloads the data +# and saves it to the location provided below. +bentham_text_corpus=/export/corpora5/handwriting_ocr/hwr1/ICDAR-HTR-Competition-2015/Bentham-Text + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
+ local/prepare_data.sh --database-dir $bentham_hwr_database \ + --text-corpus-dir $bentham_text_corpus +fi + +if [ $stage -le 1 ]; then + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$(date) Extracting features, creating feats.scp file" + for dataset in train val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
+ local/chain/run_cnn_e2eali.sh +fi diff --git a/egs/bentham/v1/steps b/egs/bentham/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/bentham/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/bentham/v1/utils b/egs/bentham/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/bentham/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/bn_music_speech/v1/local/make_annotations_bn.py b/egs/bn_music_speech/v1/local/make_annotations_bn.py index 53cebf52ea4..86bec7b16ae 100755 --- a/egs/bn_music_speech/v1/local/make_annotations_bn.py +++ b/egs/bn_music_speech/v1/local/make_annotations_bn.py @@ -9,6 +9,7 @@ # # This file is meant to be invoked by make_bn.sh. +from __future__ import print_function import sys, re, os def is_speech(line): @@ -37,7 +38,7 @@ def extract_speech(line): m = re.search('(?<=E_time=)\d+.\d+', line) end = float(m.group(0)) if start > end: - print "Skipping annotation where end time is before start time:", line + print("Skipping annotation where end time is before start time: {}".format(line)) return start, end def extract_other_type2(line): @@ -46,7 +47,7 @@ def extract_other_type2(line): m = re.search('(?<=E_time=)\d+.\d+', line) end = float(m.group(0)) if start > end: - print "Skipping annotation where end time is before start time:", line + print("Skipping annotation where end time is before start time: {}".format(line)) return start, end def extract_music(line): @@ -60,7 +61,7 @@ def extract_music(line): elif level == "O": is_on = False else: - print "Encountered bad token on line:", line + print("Encountered bad token on line: {}".format(line)) sys.exit() return time, is_on @@ -75,7 +76,7 @@ def extract_other_type1(line): elif level == "O": is_on = False else: - print "Encountered bad token on line:", line + print("Encountered bad token on line: {}".format(line)) sys.exit() return time, is_on @@ -92,11 +93,11 @@ def process_file(annos): for line in annos: if is_speech(line): speech_start, speech_end = extract_speech(line) - speech = speech + str(speech_start) + " " + str(speech_end) + "\n" + speech = "{}{} {}\n".format(speech, speech_start, speech_end) max_time = max(speech_end, max_time) elif is_other_type2(line): other_type2_start, other_type2_end = extract_other_type2(line) - other_type2 = other_type2 + str(other_type2_start) + " " + str(other_type2_end) + "\n" + other_type2 = "{}{} {}\n".format(other_type2, other_type2_start, other_type2_end) max_time = max(other_type2_end, max_time) elif is_music(line): time, is_on = extract_music(line) @@ -105,7 +106,7 @@ def process_file(annos): prev_music_time = time start_new_music_segment = False elif not is_on and not start_new_music_segment: - music = music + str(prev_music_time) + " " + str(time) + "\n" + music = "{}{} {}\n".format(music, prev_music_time, time) start_new_music_segment = True elif is_other_type1(line): time, is_on = extract_other_type1(line) @@ -114,13 +115,13 @@ def process_file(annos): prev_other_time = time start_new_other_segment = False elif not is_on and not start_new_other_segment: - other_type1 = other_type1 + str(prev_other_time) + " " + str(time) + "\n" + other_type1 = "{}{} {}\n".format(other_type1, prev_other_time, time) start_new_other_segment = True if not start_new_music_segment: - music = music + str(prev_music_time) + " " + str(max_time) + "\n" + music = "{}{} {}\n".format(music, prev_music_time, max_time) if not start_new_other_segment: - other_type1 
= other_type1 + str(prev_other_time) + " " + str(max_time) + "\n" + other_type1 = "{}{} {}\n".format(other_type1, prev_other_time, max_time) other = other_type1 + other_type2 return speech, music, other diff --git a/egs/bn_music_speech/v1/local/make_bn.py b/egs/bn_music_speech/v1/local/make_bn.py index 98836d32534..7ec9aabcbdf 100755 --- a/egs/bn_music_speech/v1/local/make_bn.py +++ b/egs/bn_music_speech/v1/local/make_bn.py @@ -20,7 +20,7 @@ for file in files: utt = str(file).replace(".sph", "") if file.endswith(".sph") and utt in utts: - wav = wav + utt + " sox " + subdir + "/" + utt + ".sph" + " -c 1 -r 16000 -t wav - |\n" + wav = "{0}{1} sox {2}/{1}.sph -c 1 -r 16000 -t wav - |\n".format(wav, utt, subdir) wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') wav_fi.write(wav) @@ -32,14 +32,14 @@ count = 1 for line in music_fi: left, right = line.rstrip().split(" ") - segments = segments + utt + "-music-" + str(count) + " " + utt + " " + left + " " + right + "\n" - utt2spk = utt2spk + utt + "-music-" + str(count) + " " + utt + "-music-" + str(count) + "\n" + segments = "{0}{1}-music-{2} {1} {3} {4}\n".format(segments, utt, count, left, right) + utt2spk = "{0}{1}-music-{2} {1}-music-{2}\n".format(utt2spk, utt, count) count += 1 count = 1 for line in speech_fi: left, right = line.rstrip().split(" ") - segments = segments + utt + "-speech-" + str(count) + " " + utt + " " + left + " " + right + "\n" - utt2spk = utt2spk + utt + "-speech-" + str(count) + " " + utt + "-speech-" + str(count) + "\n" + segments = "{0}{1}-speech-{2} {1} {3} {4}\n".format(segments, utt, count, left, right) + utt2spk = "{0}{1}-speech-{2} {1}-speech-{2}\n".format(utt2spk, utt, count) count += 1 utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') utt2spk_fi.write(utt2spk) diff --git a/egs/bn_music_speech/v1/local/make_musan.py b/egs/bn_music_speech/v1/local/make_musan.py deleted file mode 100755 index b3795fe2b7d..00000000000 --- a/egs/bn_music_speech/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh.
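For reference, the wav.scp, segments and utt2spk entries that make_bn.py builds with the format strings above look like the following. This is only an illustrative sketch; the recording id, corpus path, segment times and count are made-up values.

utt = "19970417_1830_CNN_HDL"                       # hypothetical recording id
subdir = "/export/corpora5/LDC/LDC97S44/hub4"        # hypothetical corpus path

# wav.scp: recording id followed by a command producing 16 kHz mono wav
wav_entry = "{0} sox {1}/{0}.sph -c 1 -r 16000 -t wav - |".format(utt, subdir)

# segments / utt2spk for a music region from 12.3 s to 45.6 s
count, left, right = 1, "12.3", "45.6"
seg_entry = "{0}-music-{1} {0} {2} {3}".format(utt, count, left, right)
utt2spk_entry = "{0}-music-{1} {0}-music-{1}".format(utt, count)

print(wav_entry)
print(seg_entry)
print(utt2spk_entry)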
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if 
__name__=="__main__": - main() diff --git a/egs/bn_music_speech/v1/local/make_musan.sh b/egs/bn_music_speech/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/bn_music_speech/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/bn_music_speech/v1/local/print_scores.py b/egs/bn_music_speech/v1/local/print_scores.py index c2b587cdcad..e563afb63d7 100755 --- a/egs/bn_music_speech/v1/local/print_scores.py +++ b/egs/bn_music_speech/v1/local/print_scores.py @@ -11,6 +11,7 @@ # those strings to determine if it is a target or nontarget # utterance. We arbitrarily pick music to be the target class. +from __future__ import print_function import sys utt2score = open(sys.argv[1], 'r').readlines() for i in range(0, len(utt2score)): @@ -19,4 +20,4 @@ type = "target" else: type = "nontarget" - print score, type + print(score, type) diff --git a/egs/bn_music_speech/v1/local/refine_annotations_bn.py b/egs/bn_music_speech/v1/local/refine_annotations_bn.py index 52ac87c8640..31cb1803f57 100755 --- a/egs/bn_music_speech/v1/local/refine_annotations_bn.py +++ b/egs/bn_music_speech/v1/local/refine_annotations_bn.py @@ -10,6 +10,7 @@ # designated length are created. # # This file is meant to be invoked from make_bn.sh. 
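The from __future__ imports being added throughout these scripts make the Python 2 sources behave like Python 3 for printing and division. A minimal sketch of the effect (the values are arbitrary):

from __future__ import print_function, division

score = 3.2
print("{} {}".format(score, "target"))  # print is a function under both interpreters
print(7 / 2)    # 3.5 under both Python 2.7 and Python 3 (true division)
print(7 // 2)   # 3, when floor division is actually wanted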
+from __future__ import division import sys, os def seg_to_string(seg): @@ -23,7 +24,7 @@ def seg_to_string(seg): def process_segs(raw_segs): segs = [] for seg in raw_segs: - lower, upper = map(float, seg.rstrip().split(" ")) + lower, upper = [float(i) for i in seg.rstrip().split(" ")] segs.append((lower, upper)) return segs @@ -60,8 +61,8 @@ def resegment(music, speech, other, frame_length, min_seg): start_frame = 0 for i in range(1, len(frame2classes)): if curr_class != frame2classes[i]: - start = float(start_frame) / frame_length - end = float(i) / frame_length + start = float(start_frame)/frame_length + end = float(i)/frame_length if end - start > min_seg: if curr_class == "music": new_music.append((start, end)) diff --git a/egs/bn_music_speech/v1/run.sh b/egs/bn_music_speech/v1/run.sh index 6cc0531e9d7..08d5c022a9d 100755 --- a/egs/bn_music_speech/v1/run.sh +++ b/egs/bn_music_speech/v1/run.sh @@ -20,7 +20,7 @@ vaddir=`pwd`/mfcc local/make_bn.sh /export/corpora5/LDC/LDC97S44 \ /export/corpora/LDC/LDC97T22 data -local/make_musan.sh /export/corpora/JHU/musan data +steps/data/make_musan.sh --sampling-rate 16000 /export/corpora/JHU/musan data steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 30 --cmd "$train_cmd" \ data/musan_speech exp/make_mfcc $mfccdir diff --git a/egs/callhome_diarization/v1/diarization/VB_diarization.py b/egs/callhome_diarization/v1/diarization/VB_diarization.py new file mode 100755 index 00000000000..62676d64510 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_diarization.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 +# Copyright 2013-2017 Lukas Burget (burget@fit.vutbr.cz) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Revision History +# L. Burget 16/07/13 01:00AM - original version +# L. Burget 20/06/17 12:07AM - np.asarray replaced by .toarray() +# - minor bug fix in initializing q +# - minor bug fix in ELBO calculation +# - few more optimizations + +import numpy as np +from scipy.sparse import coo_matrix +import scipy.linalg as spl +#import numexpr as ne # the dependency on this modul can be avoided by replacing +# # logsumexp_ne and exp_ne with logsumexp and np.exp + +#[q sp Li] = +def VB_diarization(X, m, iE, w, V, sp=None, q=None, + maxSpeakers = 10, maxIters = 10, + epsilon = 1e-4, loopProb = 0.99, statScale = 1.0, + alphaQInit = 1.0, downsample = None, VtiEV = None, ref=None, + plot=False, sparsityThr=0.001, llScale=1.0, minDur=1): + + """ + This a generalized version of speaker diarization described in: + + Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + Montreal, CRIM, May 2008. + + Kenny, P., Reynolds, D., and Castaldo, F. Diarization of Telephone + Conversations using Factor Analysis IEEE Journal of Selected Topics in Signal + Processing, December 2010. + + The generalization introduced in this implementation lies in using an HMM + instead of the simple mixture model when modeling generation of segments + (or even frames) from speakers. 
HMM limits the probability of switching + between speakers when changing frames, which makes it possible to use + the model on frame-by-frame bases without any need to iterate between + 1) clustering speech segments and 2) re-segmentation (i.e. as it was done in + the paper above). + + Inputs: + X - T x D array, where columns are D dimensional feature vectors for T frames + m - C x D array of GMM component means + iE - C x D array of GMM component inverse covariance matrix diagonals + w - C dimensional column vector of GMM component weights + V - R x C x D array of eigenvoices + maxSpeakers - maximum number of speakers expected in the utterance + maxIters - maximum number of algorithm iterations + epsilon - stop iterating, if obj. fun. improvement is less than epsilon + loopProb - probability of not switching speakers between frames + statScale - scale sufficient statiscits collected using UBM + llScale - scale UBM likelihood (i.e. llScale < 1.0 make atribution of + frames to UBM componets more uncertain) + sparsityThr - set occupations smaller that this threshold to 0.0 (saves memory + as the posteriors are represented by sparse matrix) + alphaQInit - Dirichlet concentraion parameter for initializing q + downsample - perform diarization on input downsampled by this factor + VtiEV - C x (R**2+R)/2 matrix normally calculated by VB_diarization when + VtiEV is None. However, it can be pre-calculated using function + precalculate_VtiEV(V) and used across calls of VB_diarization. + minDur - minimum number of frames between speaker turns imposed by linear + chains of HMM states corresponding to each speaker. All the states + in a chain share the same output distribution + ref - T dim. integer vector with reference speaker ID (0:maxSpeakers) + per frame + plot - if set to True, plot per-frame speaker posteriors. + + Outputs: + q - S x T matrix of posteriors attribution each frame to one of S possible + speakers, where S is given by opts.maxSpeakers + sp - S dimensional column vector of ML learned speaker priors. Ideally, these + should allow to estimate # of speaker in the utterance as the + probabilities of the redundant speaker should converge to zero. + Li - values of auxiliary function (and DER and frame cross-entropy between q + and reference if 'ref' is provided) over iterations. + """ + + # The references to equations corresponds to the technical report: + # Kenny, P. Bayesian Analysis of Speaker Diarization with Eigenvoice Priors, + # Montreal, CRIM, May 2008. + + D=X.shape[1] # feature dimensionality + C=len(w) # number of mixture components + R=V.shape[0] # subspace rank + nframes=X.shape[0] + + if VtiEV is None: + VtiEV = precalculate_VtiEV(V, iE) + + V = V.reshape(V.shape[0],-1) + + if sp is None: + sp = np.ones(maxSpeakers)/maxSpeakers + else: + maxSpeakers = len(sp) + + if q is None: + # initialize q from flat Dirichlet prior with concentrsaion parameter alphaQInit + q = np.random.gamma(alphaQInit, size=(nframes, maxSpeakers)) + q = q / q.sum(1, keepdims=True) + + # calculate UBM mixture frame posteriors (i.e. 
per-frame zero order statistics) + ll = (X**2).dot(-0.5*iE.T) + X.dot(iE.T*m.T)-0.5*((iE * m**2 - np.log(iE)).sum(1) - 2*np.log(w) + D*np.log(2*np.pi)) + ll *= llScale + G = logsumexp(ll, axis=1) + NN = np.exp(ll - G[:,np.newaxis]) * statScale + NN[NN 0 and L - Li[-2][0] < epsilon: + if L - Li[-1][0] < 0: print('WARNING: Value of auxiliary function has decreased!') + break + + if downsample is not None: + #upsample resulting q to match number of frames in the input utterance + q = downsampler.T.dot(q) + + return q, sp, Li + + +def precalculate_VtiEV(V, iE): + tril_ind = np.tril_indices(V.shape[0]) + VtiEV = np.empty((V.shape[1],len(tril_ind[0])), V.dtype) + for c in range(V.shape[1]): + VtiEV[c,:] = np.dot(V[:,c,:]*iE[np.newaxis,c,:], V[:,c,:].T)[tril_ind] + return VtiEV + + +# Initialize q (per-frame speaker posteriors) from a reference +# (vector of per-frame zero based integer speaker IDs) +def frame_labels2posterior_mx(labels, maxSpeakers): + #initialize from reference + #pmx = np.zeros((len(labels), labels.max()+1)) + pmx = np.zeros((len(labels), maxSpeakers)) + pmx[np.arange(len(labels)), labels] = 1 + return pmx + +# Calculates Diarization Error Rate (DER) or per-frame cross-entropy between +# reference (vector of per-frame zero based integer speaker IDs) and q (per-frame +# speaker posteriors). If expected=False, q is converted into hard labels before +# calculating DER. If expected=TRUE, posteriors in q are used to calculated +# "expected" DER. +def DER(q, ref, expected=True, xentropy=False): + from itertools import permutations + + if not expected: + # replce probabiities in q by zeros and ones + hard_labels = q.argmax(1) + q = np.zeros_like(q) + q[range(len(q)), hard_labels] = 1 + + err_mx = np.empty((ref.max()+1, q.shape[1])) + for s in range(err_mx.shape[0]): + tmpq = q[ref == s,:] + err_mx[s] = (-np.log(tmpq) if xentropy else tmpq).sum(0) + + if err_mx.shape[0] < err_mx.shape[1]: + err_mx = err_mx.T + + # try all alignments (permutations) of reference and detected speaker + #could be written in more efficient way using dynamic programing + acc = [err_mx[perm[:err_mx.shape[1]], range(err_mx.shape[1])].sum() + for perm in permutations(range(err_mx.shape[0]))] + if xentropy: + return min(acc)/float(len(ref)) + else: + return (len(ref) - max(acc))/float(len(ref)) + + +############################################################################### +# Module private functions +############################################################################### +def logsumexp(x, axis=0): + xmax = x.max(axis) + x = xmax + np.log(np.sum(np.exp(x - np.expand_dims(xmax, axis)), axis)) + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +# The folowing two functions are only versions optimized for speed using numexpr +# module and can be replaced by logsumexp and np.exp functions to avoid +# the dependency on the module. 
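The module-private logsumexp above underpins the per-frame posterior computation; subtracting the per-row maximum before exponentiating is what keeps the values finite. A small self-contained check of that behaviour (the input values are arbitrary and chosen to break a naive implementation):

import numpy as np

x = np.array([[1000.0, 1000.1],
              [-1000.0, -999.9]])

xmax = x.max(axis=1)
stable = xmax + np.log(np.exp(x - xmax[:, np.newaxis]).sum(axis=1))
naive = np.log(np.exp(x).sum(axis=1))  # overflows / underflows

print(stable)  # ~[1000.744, -999.256], both finite
print(naive)   # [inf, -inf] (with overflow warnings)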
+def logsumexp_ne(x, axis=0): + xmax = np.array(x).max(axis=axis) + xmax_e = np.expand_dims(xmax, axis) + x = ne.evaluate("sum(exp(x - xmax_e), axis=%d)" % axis) + x = ne.evaluate("xmax + log(x)") + infs = np.isinf(xmax) + if np.ndim(x) > 0: + x[infs] = xmax[infs] + elif infs: + x = xmax + return x + + +def exp_ne(x, out=None): + return ne.evaluate("exp(x)", out=None) + + +# Convert vector with lower-triangular coefficients into symetric matrix +def tril_to_sym(tril): + R = np.sqrt(len(tril)*2).astype(int) + tril_ind = np.tril_indices(R) + S = np.empty((R,R)) + S[tril_ind] = tril + S[tril_ind[::-1]] = tril + return S + + +def logdet(A): + return 2*np.sum(np.log(np.diag(spl.cholesky(A)))) + + +def forward_backward(lls, tr, ip): + """ + Inputs: + lls - matrix of per-frame log HMM state output probabilities + tr - transition probability matrix + ip - vector of initial state probabilities (i.e. statrting in the state) + Outputs: + sp - matrix of per-frame state occupation posteriors + tll - total (forward) log-likelihood + lfw - log forward probabilities + lfw - log backward probabilities + """ + ltr = np.log(tr) + lfw = np.empty_like(lls) + lbw = np.empty_like(lls) + lfw[:] = -np.inf + lbw[:] = -np.inf + lfw[0] = lls[0] + np.log(ip) + lbw[-1] = 0.0 + + for ii in range(1,len(lls)): + lfw[ii] = lls[ii] + logsumexp(lfw[ii-1] + ltr.T, axis=1) + + for ii in reversed(range(len(lls)-1)): + lbw[ii] = logsumexp(ltr + lls[ii+1] + lbw[ii+1], axis=1) + + tll = logsumexp(lfw[-1]) + sp = np.exp(lfw + lbw - tll) + return sp, tll, lfw, lbw diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.py b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py new file mode 100755 index 00000000000..e507c088563 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +# Copyright 2019 Zili Huang + +# This script is evoked by diarization/VB_resegmentation.sh. It prepares the necessary +# inputs for the VB system and creates the output RTTM file. The inputs include data directory +# (data_dir), the rttm file to initialize the VB system(init_rttm_filename), the directory to +# output the rttm prediction(output_dir), path to diagonal UBM model(dubm_model) and path to +# i-vector extractor model(ie_model). + +import numpy as np +import VB_diarization +import kaldi_io +import argparse +from convert_VB_model import load_dubm, load_ivector_extractor + +def get_utt_list(utt2spk_filename): + with open(utt2spk_filename, 'r') as fh: + content = fh.readlines() + utt_list = [line.split()[0] for line in content] + print("{} utterances in total".format(len(utt_list))) + return utt_list + +# prepare utt2num_frames dictionary +def get_utt2num_frames(utt2num_frames_filename): + utt2num_frames = {} + with open(utt2num_frames_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + utt2num_frames[line_split[0]] = int(line_split[1]) + return utt2num_frames + +# prepare utt2feats dictionary +def get_utt2feats(utt2feats_filename): + utt2feats = {} + with open(utt2feats_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split(None, 1) + utt2feats[line_split[0]] = line_split[1] + return utt2feats + +def create_ref(uttname, utt2num_frames, full_rttm_filename): + num_frames = utt2num_frames[uttname] + + # We use 0 to denote silence frames and 1 to denote overlapping frames. 
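The loopProb and sp parameters documented above define a speaker-loop HMM: stay with the current speaker with probability loopProb, otherwise pick a speaker from the priors sp. For the simplest case minDur=1 that transition structure can be sketched as below; the speaker count and values are illustrative and not taken from the code.

import numpy as np

S = 3                      # hypothetical number of speaker states
loopProb = 0.99            # probability of not switching speakers between frames
sp = np.ones(S) / S        # flat speaker priors, as used at initialization

tr = loopProb * np.eye(S) + (1.0 - loopProb) * np.tile(sp, (S, 1))
ip = sp                    # initial state probabilities

print(tr.sum(axis=1))      # each row sums to 1
print(tr[0])               # ~[0.9933, 0.0033, 0.0033]: switching is rare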
+ ref = np.zeros(num_frames) + speaker_dict = {} + num_spk = 0 + + with open(full_rttm_filename, 'r') as fh: + content = fh.readlines() + for line in content: + line = line.strip('\n') + line_split = line.split() + uttname_line = line_split[1] + if uttname != uttname_line: + continue + start_time, duration = int(float(line_split[3]) * 100), int(float(line_split[4]) * 100) + end_time = start_time + duration + spkname = line_split[7] + if spkname not in speaker_dict.keys(): + spk_idx = num_spk + 2 + speaker_dict[spkname] = spk_idx + num_spk += 1 + + for i in range(start_time, end_time): + if i < 0: + raise ValueError("Time index less than 0") + elif i >= num_frames: + print("Time index exceeds number of frames") + break + else: + if ref[i] == 0: + ref[i] = speaker_dict[spkname] + else: + ref[i] = 1 # The overlapping speech is marked as 1. + return ref.astype(int) + +# create output rttm file +def create_rttm_output(uttname, predicted_label, output_dir, channel): + num_frames = len(predicted_label) + + start_idx = 0 + seg_list = [] + + last_label = predicted_label[0] + for i in range(num_frames): + if predicted_label[i] == last_label: # The speaker label remains the same. + continue + else: # The speaker label is different. + if last_label != 0: # Ignore the silence. + seg_list.append([start_idx, i, last_label]) + start_idx = i + last_label = predicted_label[i] + if last_label != 0: + seg_list.append([start_idx, num_frames, last_label]) + + with open("{}/{}_predict.rttm".format(output_dir, uttname), 'w') as fh: + for i in range(len(seg_list)): + start_frame = (seg_list[i])[0] + end_frame = (seg_list[i])[1] + label = (seg_list[i])[2] + duration = end_frame - start_frame + fh.write("SPEAKER {} {} {:.2f} {:.2f} {} \n".format(uttname, channel, start_frame / 100.0, duration / 100.0, label)) + return 0 + +def main(): + parser = argparse.ArgumentParser(description='VB Resegmentation Wrapper') + parser.add_argument('data_dir', type=str, help='Subset data directory') + parser.add_argument('init_rttm_filename', type=str, + help='The rttm file to initialize the VB system, usually the AHC cluster result') + parser.add_argument('output_dir', type=str, help='Output directory') + parser.add_argument('dubm_model', type=str, help='Path of the diagonal UBM model') + parser.add_argument('ie_model', type=str, help='Path of the i-vector extractor model') + + parser.add_argument('--max-speakers', type=int, default=10, + help='Maximum number of speakers expected in the utterance (default: 10)') + parser.add_argument('--max-iters', type=int, default=10, + help='Maximum number of algorithm iterations (default: 10)') + parser.add_argument('--downsample', type=int, default=25, + help='Perform diarization on input downsampled by this factor (default: 25)') + parser.add_argument('--alphaQInit', type=float, default=100.0, + help='Dirichlet concentraion parameter for initializing q') + parser.add_argument('--sparsityThr', type=float, default=0.001, + help='Set occupations smaller that this threshold to 0.0 (saves memory as \ + the posteriors are represented by sparse matrix)') + parser.add_argument('--epsilon', type=float, default=1e-6, + help='Stop iterating, if obj. fun. improvement is less than epsilon') + parser.add_argument('--minDur', type=int, default=1, + help='Minimum number of frames between speaker turns imposed by linear \ + chains of HMM states corresponding to each speaker. 
All the states \ + in a chain share the same output distribution') + parser.add_argument('--loopProb', type=float, default=0.9, + help='Probability of not switching speakers between frames') + parser.add_argument('--statScale', type=float, default=0.2, + help='Scale sufficient statiscits collected using UBM') + parser.add_argument('--llScale', type=float, default=1.0, + help='Scale UBM likelihood (i.e. llScale < 1.0 make atribution of \ + frames to UBM componets more uncertain)') + parser.add_argument('--channel', type=int, default=0, + help='Channel information in the rttm file') + parser.add_argument('--initialize', type=int, default=1, + help='Whether to initalize the speaker posterior') + + args = parser.parse_args() + print(args) + + utt_list = get_utt_list("{}/utt2spk".format(args.data_dir)) + utt2num_frames = get_utt2num_frames("{}/utt2num_frames".format(args.data_dir)) + + # Load the diagonal UBM and i-vector extractor + dubm_para = load_dubm(args.dubm_model) + ie_para = load_ivector_extractor(args.ie_model) + + # Check the diagonal UBM and i-vector extractor model + assert '' in dubm_para and '' in dubm_para and '' in dubm_para + DUBM_WEIGHTS, DUBM_MEANS_INVVARS, DUBM_INV_VARS = dubm_para[''], dubm_para[''], dubm_para[''] + assert 'M' in ie_para + IE_M = np.transpose(ie_para['M'], (2, 0, 1)) + + m = DUBM_MEANS_INVVARS / DUBM_INV_VARS + iE = DUBM_INV_VARS + w = DUBM_WEIGHTS + V = IE_M + + # Load the MFCC features + feats_dict = get_utt2feats("{}/feats.scp".format(args.data_dir)) + + for utt in utt_list: + # Get the alignments from the clustering result. + # In init_ref, 0 denotes the silence silence frames + # 1 denotes the overlapping speech frames, the speaker + # label starts from 2. + init_ref = create_ref(utt, utt2num_frames, args.init_rttm_filename) + + # load MFCC features + X = kaldi_io.read_mat(feats_dict[utt]).astype(np.float64) + assert len(init_ref) == len(X) + + # Keep only the voiced frames (0 denotes the silence + # frames, 1 denotes the overlapping speech frames). + mask = (init_ref >= 2) + X_voiced = X[mask] + init_ref_voiced = init_ref[mask] - 2 + + if X_voiced.shape[0] == 0: + print("Warning: {} has no voiced frames in the initialization file".format(utt)) + continue + + # Initialize the posterior of each speaker based on the clustering result. + if args.initialize: + q = VB_diarization.frame_labels2posterior_mx(init_ref_voiced, args.max_speakers) + else: + q = None + + # VB resegmentation + + # q - S x T matrix of posteriors attribution each frame to one of S possible + # speakers, where S is given by opts.maxSpeakers + # sp - S dimensional column vector of ML learned speaker priors. Ideally, these + # should allow to estimate # of speaker in the utterance as the + # probabilities of the redundant speaker should converge to zero. + # Li - values of auxiliary function (and DER and frame cross-entropy between q + # and reference if 'ref' is provided) over iterations. 
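The label convention used here (0 for silence, 1 for overlap, speakers numbered from 2) and the mask/offset bookkeeping around the VB call can be illustrated with a tiny self-contained example; the frame labels and posteriors below are made up.

import numpy as np

# Hypothetical per-frame labels: 0 = silence, 1 = overlap, speakers start at 2.
init_ref = np.array([0, 0, 2, 2, 3, 1, 3, 3, 0])

mask = init_ref >= 2                  # keep only single-speaker voiced frames
init_ref_voiced = init_ref[mask] - 2  # zero-based speaker ids for the VB code

# After resegmentation, q_out has one row per voiced frame; mapping back:
q_out = np.array([[0.9, 0.1]] * mask.sum())     # stand-in posteriors
predicted = np.zeros(len(init_ref), dtype=int)  # frames outside the mask stay 0
predicted[mask] = q_out.argmax(axis=1) + 2
print(predicted)                                # [0 0 2 2 2 0 2 2 0]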
+ q_out, sp_out, L_out = VB_diarization.VB_diarization(X_voiced, m, iE, w, V, sp=None, q=q, maxSpeakers=args.max_speakers, maxIters=args.max_iters, VtiEV=None, + downsample=args.downsample, alphaQInit=args.alphaQInit, sparsityThr=args.sparsityThr, epsilon=args.epsilon, minDur=args.minDur, + loopProb=args.loopProb, statScale=args.statScale, llScale=args.llScale, ref=None, plot=False) + predicted_label_voiced = np.argmax(q_out, 1) + 2 + predicted_label = (np.zeros(len(mask))).astype(int) + predicted_label[mask] = predicted_label_voiced + + # Create the output rttm file + create_rttm_output(utt, predicted_label, args.output_dir, args.channel) + return 0 + +if __name__ == "__main__": + main() diff --git a/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh new file mode 100755 index 00000000000..765c4eee8b8 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/VB_resegmentation.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +# Copyright 2019 Zili Huang + +# This script is a wrapper for Variational Bayes resegmentation. +# It shows how to use the code from Brno University of Technology +# to do resegmentation. + +# Begin configuration section. +nj=20 +cmd=run.pl +stage=0 +max_speakers=10 +max_iters=10 +downsample=25 +alphaQInit=100.0 +sparsityThr=0.001 +epsilon=1e-6 +minDur=1 +loopProb=0.9 +statScale=0.2 +llScale=1.0 +channel=0 +initialize=1 +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: diarization/VB_resegmentation.sh " + echo "Variational Bayes Re-segmenatation" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # How to run jobs." + echo " --nj # Number of parallel jobs to run." + echo " --max-speakers # Maximum number of speakers" + echo " # expected in the utterance" + echo " # (default: 10)" + echo " --max-iters # Maximum number of algorithm" + echo " # iterations (default: 10)" + echo " --downsample # Perform diarization on input" + echo " # downsampled by this factor" + echo " # (default: 25)" + echo " --alphaQInit # Dirichlet concentraion" + echo " # parameter for initializing q" + echo " --sparsityThr # Set occupations smaller that" + echo " # this threshold to 0.0 (saves" + echo " # memory as the posteriors are" + echo " # represented by sparse matrix)" + echo " --epsilon # Stop iterating, if obj. fun." + echo " # improvement is less than" + echo " # epsilon" + echo " --minDur # Minimum number of frames" + echo " # between speaker turns imposed" + echo " # by linear chains of HMM" + echo " # state corresponding to each" + echo " # speaker. All the states in" + echo " # a chain share the same output" + echo " # distribution" + echo " --loopProb # Probability of not switching" + echo " # speakers between frames" + echo " --statScale # Scale sufficient statistics" + echo " # collected using UBM" + echo " --llScale # Scale UBM likelihood (i.e." 
+ echo " # llScale < 1.0 make" + echo " # attribution of frames to UBM" + echo " # componets more uncertain)" + echo " --channel # Channel information in the rttm file" + echo " --initialize # Whether to initalize the" + echo " # speaker posterior (if not)" + echo " # the speaker posterior will be" + echo " # randomly initilized" + + exit 1; +fi + +data_dir=$1 +init_rttm_filename=$2 +output_dir=$3 +dubm_model=$4 +ie_model=$5 + +mkdir -p $output_dir/tmp + +sdata=$data_dir/split$nj; +utils/split_data.sh $data_dir $nj || exit 1; + +if [ $stage -le 0 ]; then + # Dump the diagonal UBM model into txt format. + "$train_cmd" $output_dir/log/convert_diag_ubm.log \ + gmm-global-copy --binary=false \ + $dubm_model \ + $output_dir/tmp/dubm.tmp || exit 1; + + # Dump the ivector extractor model into txt format. + "$train_cmd" $output_dir/log/convert_ie.log \ + ivector-extractor-copy --binary=false \ + $ie_model \ + $output_dir/tmp/ie.tmp || exit 1; +fi + +if [ $stage -le 1 ]; then + # VB resegmentation + $cmd JOB=1:$nj $output_dir/log/VB_resegmentation.JOB.log \ + python3 diarization/VB_resegmentation.py --max-speakers $max_speakers \ + --max-iters $max_iters --downsample $downsample --alphaQInit $alphaQInit \ + --sparsityThr $sparsityThr --epsilon $epsilon --minDur $minDur \ + --loopProb $loopProb --statScale $statScale --llScale $llScale \ + --channel $channel --initialize $initialize \ + $sdata/JOB $init_rttm_filename $output_dir/tmp $output_dir/tmp/dubm.tmp $output_dir/tmp/ie.tmp || exit 1; + + cat $output_dir/tmp/*.rttm > $output_dir/rttm/VB_rttm +fi diff --git a/egs/callhome_diarization/v1/diarization/cluster.sh b/egs/callhome_diarization/v1/diarization/cluster.sh index 4f46b3ba5ef..5e5c6e9dbe5 100755 --- a/egs/callhome_diarization/v1/diarization/cluster.sh +++ b/egs/callhome_diarization/v1/diarization/cluster.sh @@ -14,6 +14,9 @@ stage=0 nj=10 cleanup=true threshold=0.5 +max_spk_fraction=1.0 +first_pass_max_utterances=32767 +rttm_channel=0 read_costs=false reco2num_spk= # End configuration section. @@ -35,6 +38,17 @@ if [ $# != 2 ]; then echo " --threshold # Cluster stopping criterion. Clusters with scores greater" echo " # than this value will be merged until all clusters" echo " # exceed this value." + echo " --max-spk-fraction # Clusters with total fraction of utterances greater than" + echo " # this value will not be merged. This is active only when" + echo " # reco2num-spk is supplied and" + echo " # 1.0 / num-spk <= max-spk-fraction <= 1.0." + echo " --first-pass-max-utterances # If the number of utterances is larger than first-pass-max-utterances," + echo " # then clustering is done in two passes. In the first pass, input points" + echo " # are divided into contiguous subsets of size first-pass-max-utterances" + echo " # and each subset is clustered separately. In the second pass, the first" + echo " # pass clusters are merged into the final set of clusters." + echo " --rttm-channel # The value passed into the RTTM channel field. Only affects" + echo " # the format of the RTTM file." echo " --read-costs # If true, interpret input scores as costs, i.e. similarity" echo " # is indicated by smaller values. 
If enabled, clusters will" echo " # be merged until all cluster scores are less than the" @@ -75,8 +89,10 @@ if [ $stage -le 0 ]; then echo "$0: clustering scores" $cmd JOB=1:$nj $dir/log/agglomerative_cluster.JOB.log \ agglomerative-cluster --threshold=$threshold --read-costs=$read_costs \ - --reco2num-spk-rspecifier=$reco2num_spk scp:"$feats" \ - ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; + --reco2num-spk-rspecifier=$reco2num_spk \ + --max-spk-fraction=$max_spk_fraction \ + --first-pass-max-utterances=$first_pass_max_utterances \ + scp:"$feats" ark,t:$sdata/JOB/spk2utt ark,t:$dir/labels.JOB || exit 1; fi if [ $stage -le 1 ]; then @@ -86,7 +102,7 @@ fi if [ $stage -le 2 ]; then echo "$0: computing RTTM" - diarization/make_rttm.py $srcdir/segments $dir/labels $dir/rttm || exit 1; + diarization/make_rttm.py --rttm-channel $rttm_channel $srcdir/segments $dir/labels $dir/rttm || exit 1; fi if $cleanup ; then diff --git a/egs/callhome_diarization/v1/diarization/convert_VB_model.py b/egs/callhome_diarization/v1/diarization/convert_VB_model.py new file mode 100755 index 00000000000..b1f25b0dbfd --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/convert_VB_model.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +# Copyright 2019 Zili Huang +# Apache 2.0 + +# This script loads diagonal UBM and ivector extractor from text file. + +import os +import numpy as np + +def load_dubm(dubm_text): + assert os.path.exists(dubm_text) + + para_dict = {} + state = 0 + data_array = [] + + with open(dubm_text, 'r') as fh: + content = fh.readlines() + + for line in content: + line = line.strip('\n') + line_split = line.split() + if state == 0: + if len(line_split) == 1: + continue + elif len(line_split) == 2 and line_split[1] == "[": # Start of a multi-line matrix like and + para_name = line_split[0] + state = 1 + data_array = [] + elif len(line_split) >= 3 and line_split[1] == "[" and line_split[-1] == "]": # Single line vector like + para_name = line_split[0] + data_list = [] + for i in range(2, len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + para_dict[para_name] = data_list + else: + raise ValueError("Condition not defined.") + elif state == 1: + if line_split[-1] == "]": # End of a multi-line matrix like and + data_list = [] + for i in range(len(line_split) - 1): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + data_array = np.array(data_array) + para_dict[para_name] = data_array + state = 0 + else: + data_list = [] + for i in range(len(line_split)): + data_list.append(float(line_split[i])) + data_list = np.array(data_list) + data_array.append(data_list) + else: + raise ValueError("Condition not defined.") + return para_dict # the diagonal ubm parameter includes , , , + +def load_ivector_extractor(ie_text): + assert os.path.exists(ie_text) + + para_dict = {} + state = 0 + data_3dmatrix = [] + + with open(ie_text, 'r') as fh: + content = fh.readlines() + + for line in content: + line = line.strip('\n') + if line == " [": + break + if state == 0: + if not line.startswith(""): + continue + else: + state = 1 + data_matrix = [] + elif state == 1: + line_split = line.split() + if line_split[0] == "[": + data_matrix = [] + continue + elif line_split[-1] == "]": + data_array = [] + for i in range(len(line_split)-1): + data_array.append(float(line_split[i])) + data_matrix.append(data_array) + data_3dmatrix.append(data_matrix) + else: + data_array = [] + for i in range(len(line_split)): + 
data_array.append(float(line_split[i])) + data_matrix.append(data_array) + else: + raise ValueError("Condition not defined.") + para_dict['M'] = np.array(data_3dmatrix) + return para_dict # the ivector extractor parameter is a 3d matrix of shape [num-gaussian, feat-dim, ivec-dim] diff --git a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh index 370a37b873e..d7bb389bad5 100755 --- a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh +++ b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh @@ -29,6 +29,10 @@ min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) posterior_scale=1.0 # This scale helps to control for successve features being highly # correlated. E.g. try 0.1 or 0.3. apply_cmn=true # If true, apply sliding window cepstral mean normalization +apply_deltas=true # If true, copy the delta options from the i-vector extractor directory. + # If false, we won't add deltas in this step. For speaker diarization, + # we sometimes need to write features to disk that already have various + # post-processing applied so adding deltas is no longer needed in this stage. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -57,6 +61,12 @@ if [ $# != 3 ]; then echo " --min-post # Pruning threshold for posteriors" echo " --apply-cmn # if true, apply sliding window cepstral mean" echo " # normalization to features" + echo " --apply-deltas # If true, copy the delta options from the i-vector" + echo " # extractor directory. If false, we won't add deltas" + echo " # in this step. For speaker diarization, we sometimes" + echo " # need to write features to disk that already have" + echo " # various post-processing applied so adding deltas is" + echo " # no longer needed in this stage." exit 1; fi @@ -82,7 +92,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print $window-$period") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments @@ -95,7 +105,11 @@ mkdir -p $dir/log sub_sdata=$sub_data/split$nj; utils/split_data.sh $sub_data $nj || exit 1; -delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if $apply_deltas; then + delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +else + delta_opts="--delta-order=0" +fi ## Set up features. if $apply_cmn; then diff --git a/egs/callhome_diarization/v1/diarization/make_rttm.py b/egs/callhome_diarization/v1/diarization/make_rttm.py index 1705411069f..fc32eafd530 100755 --- a/egs/callhome_diarization/v1/diarization/make_rttm.py +++ b/egs/callhome_diarization/v1/diarization/make_rttm.py @@ -34,9 +34,7 @@ import argparse import sys - -sys.path.append('steps/libs') -import common as common_lib +import codecs def get_args(): @@ -51,6 +49,9 @@ def get_args(): help="Input labels file") parser.add_argument("rttm_file", type=str, help="Output RTTM file") + parser.add_argument("--rttm-channel", type=int, default=0, + help="The value passed into the RTTM channel field. 
\ + Only affects the format of the RTTM file.") args = parser.parse_args() return args @@ -60,14 +61,14 @@ def main(): # File containing speaker labels per segment seg2label = {} - with common_lib.smart_open(args.labels) as labels_file: + with codecs.open(args.labels, 'r', 'utf-8') as labels_file: for line in labels_file: seg, label = line.strip().split() seg2label[seg] = label # Segments file reco2segs = {} - with common_lib.smart_open(args.segments) as segments_file: + with codecs.open(args.segments, 'r', 'utf-8') as segments_file: for line in segments_file: seg, reco, start, end = line.strip().split() try: @@ -80,7 +81,7 @@ def main(): # Cut up overlapping segments so they are contiguous contiguous_segs = [] - for reco in reco2segs: + for reco in sorted(reco2segs): segs = reco2segs[reco].strip().split() new_segs = "" for i in range(1, len(segs)-1): @@ -114,14 +115,14 @@ def main(): new_segs += " " + start + "," + end + "," + label merged_segs.append(reco + new_segs) - with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer: + with codecs.open(args.rttm_file, 'w', 'utf-8') as rttm_writer: for reco_line in merged_segs: segs = reco_line.strip().split() reco = segs[0] for i in range(1, len(segs)): start, end, label = segs[i].strip().split(',') - print("SPEAKER {0} 0 {1:7.3f} {2:7.3f} {3} ".format( - reco, float(start), float(end)-float(start), label), file=rttm_writer) + print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} {4} ".format( + reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer) if __name__ == '__main__': main() diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh index d7591a6a3a8..8d579138c73 100755 --- a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh +++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh @@ -102,7 +102,7 @@ if [ $stage -le 0 ]; then fi utils/data/get_uniform_subsegments.py \ --max-segment-duration=$window \ - --overlap-duration=$(echo "$window-$period" | bc) \ + --overlap-duration=$(perl -e "print ($window-$period);") \ --max-remaining-duration=$min_segment \ --constant-duration=True \ $segments > $dir/subsegments diff --git a/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh new file mode 100755 index 00000000000..9254012f3b0 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/train_ivector_extractor_diag.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +# Copyright 2013 Daniel Povey +# 2014 David Snyder +# 2019 Zili Huang +# Apache 2.0. + +# This script trains the i-vector extractor for VB resegmentation. It is very similar to +# sid/train_ivector_extractor.sh except that the UBM is assumed to be diagonal in this script. + +# Begin configuration section. +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. +num_threads=4 +num_processes=4 # each job runs this many processes, each with --num-threads threads +cmd="run.pl" +stage=-4 +num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select +ivector_dim=400 # dimension of the extracted i-vector +use_weights=false # set to true to turn on the regression of log-weights on the ivector. 
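make_rttm.py above now writes the value passed via --rttm-channel into the second field of each SPEAKER record. A sketch of one record in the standard NIST RTTM layout (recording id, channel, times and speaker label are invented):

reco, channel, start, end, label = "iaaa", 1, 0.0, 7.44, "3"
line = "SPEAKER {0} {1} {2:7.3f} {3:7.3f} <NA> <NA> {4} <NA> <NA>".format(
    reco, channel, start, end - start, label)
print(line)   # SPEAKER iaaa 1   0.000   7.440 <NA> <NA> 3 <NA> <NA>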
+num_iters=10 +min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) +num_samples_for_weights=3 # smaller than the default for speed (relates to a sampling method) +cleanup=true +apply_cmn=true # If true, apply sliding window cepstral mean normalization +posterior_scale=1.0 # This scale helps to control for successve features being highly + # correlated. E.g. try 0.1 or 0.3 +sum_accs_opt= +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/ubm_2048_male/final.dubm data/train_male exp/extractor_male" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-iters <#iters|10> # Number of iterations of E-M" + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --num-processes # Number of processes for each queue job (relates" + echo " # to summing accs in memory)" + echo " --num-threads # Number of threads for each process (can't be usefully" + echo " # increased much above 4)" + echo " --stage # To control partial reruns" + echo " --num-gselect # Number of Gaussians to select using" + echo " # diagonal model." + echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" + echo " # sum-accs process to nfs server." + echo " --apply-cmn # if true, apply sliding window cepstral mean" + echo " # normalization to features" + exit 1; +fi + +gmm_model=$1 +data=$2 +dir=$3 +srcdir=$(dirname $gmm_model) + +for f in $gmm_model $data/feats.scp ; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +nj_full=$[$nj*$num_processes] +sdata=$data/split$nj_full; +utils/split_data.sh $data $nj_full || exit 1; + +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +if [ -f $srcdir/delta_opts ]; then + cp $srcdir/delta_opts $dir/ 2>/dev/null +fi + +parallel_opts="--num-threads $[$num_threads*$num_processes]" +## Set up features. +if $apply_cmn; then + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +else + feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" +fi + +# Initialize the i-vector extractor using the FGMM input +if [ $stage -le -2 ]; then + cp $gmm_model $dir/final.dubm || exit 1; + $cmd $dir/log/init.log \ + ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ + "gmm-global-to-fgmm $dir/final.dubm -|" $dir/0.ie || exit 1 +fi + +# Do Gaussian selection and posterior extracion + +if [ $stage -le -1 ]; then + echo $nj_full > $dir/num_jobs + echo "$0: doing Gaussian selection and posterior computation" + $cmd JOB=1:$nj_full $dir/log/gselect.JOB.log \ + gmm-global-get-post --n=$num_gselect --min-post=$min_post $dir/final.dubm "$feats" ark:- \| \ + scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; +else + if ! 
[ $nj_full -eq $(cat $dir/num_jobs) ]; then + echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" + exit 1 + fi +fi + +x=0 +while [ $x -lt $num_iters ]; do + if [ $stage -le $x ]; then + rm $dir/.error 2>/dev/null + + Args=() # bash array of training commands for 1:nj, that put accs to stdout. + for j in $(seq $nj_full); do + Args[$j]=`echo "ivector-extractor-acc-stats --num-threads=$num_threads --num-samples-for-weights=$num_samples_for_weights $dir/$x.ie '$feats' 'ark,s,cs:gunzip -c $dir/post.JOB.gz|' -|" | sed s/JOB/$j/g` + done + + echo "Accumulating stats (pass $x)" + for g in $(seq $nj); do + start=$[$num_processes*($g-1)+1] + $cmd $parallel_opts $dir/log/acc.$x.$g.log \ + ivector-extractor-sum-accs --parallel=true "${Args[@]:$start:$num_processes}" \ + $dir/acc.$x.$g || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie + fi + x=$[$x+1] +done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie +ln -s $x.ie $dir/final.ie diff --git a/egs/callhome_diarization/v1/local/make_callhome.sh b/egs/callhome_diarization/v1/local/make_callhome.sh index caa8f679f22..21411fb6194 100755 --- a/egs/callhome_diarization/v1/local/make_callhome.sh +++ b/egs/callhome_diarization/v1/local/make_callhome.sh @@ -70,4 +70,9 @@ utils/filter_scp.pl $data_dir/callhome1/wav.scp $data_dir/callhome/reco2num_spk utils/filter_scp.pl $data_dir/callhome2/wav.scp $data_dir/callhome/reco2num_spk \ > $data_dir/callhome2/reco2num_spk +rm $data_dir/callhome/segments || exit 1; +awk '{print $1, $1}' $data_dir/callhome/wav.scp > $data_dir/callhome/utt2spk +utils/utt2spk_to_spk2utt.pl $data_dir/callhome/utt2spk > $data_dir/callhome/spk2utt +utils/fix_data_dir.sh $data_dir/callhome + rm -rf $tmp_dir 2> /dev/null diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py deleted file mode 100755 index b3f6652ba40..00000000000 --- a/egs/callhome_diarization/v1/local/make_musan.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This file is meant to be invoked by make_musan.sh. 
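In train_ivector_extractor_diag.sh above, the data is split into nj * num_processes pieces and each of the nj queue jobs sums num_processes accumulators in memory. The grouping can be sketched in Python as follows (0-based indexing here, whereas the bash arrays above are 1-based; the job counts are examples):

nj, num_processes = 4, 4
nj_full = nj * num_processes          # number of pieces the data was split into

args = ["acc-from-split-{}".format(j) for j in range(1, nj_full + 1)]
for g in range(1, nj + 1):
    start = num_processes * (g - 1)
    group = args[start:start + num_processes]
    print("queue job", g, "sums", group)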
- -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = 
open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/egs/callhome_diarization/v1/local/make_musan.sh b/egs/callhome_diarization/v1/local/make_musan.sh deleted file mode 100755 index 694940ad70f..00000000000 --- a/egs/callhome_diarization/v1/local/make_musan.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl new file mode 100755 index 00000000000..71b26b55de5 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while() { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while () { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . 
$A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . $wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/callhome_diarization/v1/run.sh b/egs/callhome_diarization/v1/run.sh index acc48bd24f9..f4652c0c0ef 100755 --- a/egs/callhome_diarization/v1/run.sh +++ b/egs/callhome_diarization/v1/run.sh @@ -188,7 +188,7 @@ if [ $stage -le 6 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ exp/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh index 4f730d4753c..85a2c7fdf2b 100755 --- a/egs/callhome_diarization/v2/run.sh +++ b/egs/callhome_diarization/v2/run.sh @@ -19,6 +19,8 @@ vaddir=`pwd`/mfcc data_root=/export/corpora5/LDC stage=0 nnet_dir=exp/xvector_nnet_1a/ +num_components=1024 # the number of UBM components (used for VB resegmentation) +ivector_dim=400 # the dimension of i-vector (used for VB resegmentation) # Prepare datasets if [ $stage -le 0 ]; then @@ -53,7 +55,7 @@ if [ $stage -le 1 ]; then # callhome1 and callhome2. Each partition is treated like a held-out # dataset, and used to estimate various quantities needed to perform # diarization on the other part (and vice versa). - for name in train callhome1 callhome2; do + for name in train callhome1 callhome2 callhome; do steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \ --cmd "$train_cmd" --write-utt2num-frames true \ data/$name exp/make_mfcc $mfccdir @@ -115,7 +117,7 @@ if [ $stage -le 2 ]; then # Make a reverberated version of the SWBD+SRE list. Note that we don't add any # additive noise here. - python steps/data/reverberate_data_dir.py \ + steps/data/reverberate_data_dir.py \ "${rvb_opts[@]}" \ --speech-rvb-probability 1 \ --pointsource-noise-addition-probability 0 \ @@ -130,7 +132,7 @@ if [ $stage -le 2 ]; then # Prepare the MUSAN corpus, which consists of music, speech, and noise # suitable for augmentation. - local/make_musan.sh /export/corpora/JHU/musan data + steps/data/make_musan.sh --sampling-rate 8000 /export/corpora/JHU/musan data # Get the duration of the MUSAN recordings. This will be used by the # script augment_data_dir.py. 
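The tuning loops in these run.sh scripts pull the DER out of md-eval.pl output with grep -oP and now compare floating-point values via perl instead of bc. An equivalent sketch of that bookkeeping in Python (the log line and the threshold sweep values are invented for illustration):

import re

log_line = " OVERALL SPEAKER DIARIZATION ERROR = 7.12 percent of scored speaker time"
m = re.search(r'DIARIZATION ERROR = ([0-9]+(?:\.[0-9]+)?)', log_line)
der = float(m.group(1))
print(der)   # 7.12

best_der, best_threshold = 100.0, None
for threshold, der in [(-0.3, 8.1), (-0.2, 7.5), (-0.1, 7.9)]:
    if der < best_der:
        best_der, best_threshold = der, threshold
print(best_der, best_threshold)   # 7.5 -0.2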
@@ -140,11 +142,11 @@ if [ $stage -le 2 ]; then done # Augment with musan_noise - python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise # Augment with musan_music - python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music # Augment with musan_speech - python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble # Combine reverb, noise, music, and babble into one directory. utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble @@ -297,7 +299,7 @@ if [ $stage -le 10 ]; then der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ $nnet_dir/tuning/${dataset}_t${threshold}) - if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then best_der=$der best_threshold=$threshold fi @@ -356,3 +358,47 @@ if [ $stage -le 11 ]; then # Compare to 8.69% in ../v1/run.sh echo "Using the oracle number of speakers, DER: $der%" fi + +# Variational Bayes resegmentation using the code from Brno University of Technology +# Please see https://speech.fit.vutbr.cz/software/vb-diarization-eigenvoice-and-hmm-priors +# for details +if [ $stage -le 12 ]; then + utils/subset_data_dir.sh data/train 32000 data/train_32k + # Train the diagonal UBM. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --num-threads 8 --subsample 1 --delta-order 0 --apply-cmn false \ + data/train_32k $num_components exp/diag_ubm_$num_components + + # Train the i-vector extractor. The UBM is assumed to be diagonal. + diarization/train_ivector_extractor_diag.sh \ + --cmd "$train_cmd --mem 35G" \ + --ivector-dim $ivector_dim --num-iters 5 --apply-cmn false \ + --num-threads 1 --num-processes 1 --nj 40 \ + exp/diag_ubm_$num_components/final.dubm data/train \ + exp/extractor_diag_c${num_components}_i${ivector_dim} +fi + +if [ $stage -le 13 ]; then + output_rttm_dir=exp/VB/rttm + mkdir -p $output_rttm_dir || exit 1; + cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores/rttm > $output_rttm_dir/x_vector_rttm + init_rttm_file=$output_rttm_dir/x_vector_rttm + + # VB resegmentation. In this script, I use the x-vector result to + # initialize the VB system. You can also use i-vector result or random + # initize the VB system. The following script uses kaldi_io. 
+ # You could use `sh ../../../tools/extras/install_kaldi_io.sh` to install it + diarization/VB_resegmentation.sh --nj 20 --cmd "$train_cmd --mem 10G" \ + --initialize 1 data/callhome $init_rttm_file exp/VB \ + exp/diag_ubm_$num_components/final.dubm exp/extractor_diag_c${num_components}_i${ivector_dim}/final.ie || exit 1; + + # Compute the DER after VB resegmentation + mkdir -p exp/VB/results || exit 1; + md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s $output_rttm_dir/VB_rttm 2> exp/VB/log/VB_DER.log \ + > exp/VB/results/VB_DER.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + exp/VB/results/VB_DER.txt) + # After VB resegmentation, DER: 6.48% + echo "After VB resegmentation, DER: $der%" +fi diff --git a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh index 62bca974e53..d9faa97f266 100755 --- a/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh +++ b/egs/callhome_egyptian/s5/local/callhome_prepare_dict.sh @@ -54,9 +54,8 @@ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > \ $dir/extra_questions.txt || exit 1; # Add prons for laughter, noise, oov -for w in `grep -v sil $dir/silence_phones.txt`; do -sed -i "/\[$w\]/d" $tmpdir/lexicon.3 -done +w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|') +perl -i -ne "print unless /\[(${w%?})\]/" $tmpdir/lexicon.3 for w in `grep -v sil $dir/silence_phones.txt`; do echo "[$w] $w" diff --git a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py index f5b69a1ff86..7192ff7a1cc 100644 --- a/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py +++ b/egs/callhome_egyptian/s5/local/convert_symtable_to_utf.py @@ -1,3 +1,4 @@ +from __future__ import print_function #!/usr/bin/env py # Converts a romanized ECA word list (symbol table) to @@ -7,9 +8,9 @@ import codecs if len(sys.argv) < 3: - print "USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]" - print "E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ - /export/corpora/LDC/LDC99L22" + print("USAGE: local/convert_symtable_to_utf.py [SYMTABLE] [ECA-LEXICON]") + print("E.g., local/convert_symtable_to_utf.py data/lang/words.txt \ + /export/corpora/LDC/LDC99L22") sys.exit(1) # Note that the ECA lexicon's default encoding is ISO-8859-6, not UTF8 diff --git a/egs/callhome_egyptian/s5/local/ctm.sh b/egs/callhome_egyptian/s5/local/ctm.sh index 14056b7a44b..64a7cf0d4f6 100755 --- a/egs/callhome_egyptian/s5/local/ctm.sh +++ b/egs/callhome_egyptian/s5/local/ctm.sh @@ -18,9 +18,9 @@ fi steps/get_ctm.sh $data_dir $lang_dir $decode_dir # Make sure that channel markers match -#sed -i "s:\s.*_fsp-([AB]): \1:g" data/dev/stm -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s1\s:fsp A :g' {} -#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} sed -i -r 's:fsp\s2\s:fsp B :g' {} +#perl -i -pe "s:\s.*_fsp-([AB]): \1:g" data/dev/stm +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s1\s:fsp A :g' {} +#ls exp/tri5a/decode_dev/score_*/dev.ctm | xargs -I {} perl -i -pe 's:fsp\s2\s:fsp B :g' {} # Get the environment variables . 
/export/babel/data/software/env.sh diff --git a/egs/callhome_egyptian/s5/local/splits/get_conversation.py b/egs/callhome_egyptian/s5/local/splits/get_conversation.py index c999d3e597e..80f66174e2b 100755 --- a/egs/callhome_egyptian/s5/local/splits/get_conversation.py +++ b/egs/callhome_egyptian/s5/local/splits/get_conversation.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +from __future__ import print_function import os import re @@ -37,14 +38,14 @@ evaltest[pathComponents[12]] = numberOfConversations testConv = testConv + numberOfConversations -print "==============Train===============" -print train -print "Total Conversations in train = " + str(trainConv) -print "==============Dev===============" -print devtest -print "Total Conversations in dev = " + str(devConv) -print "==============Test===============" -print evaltest -print "Total Conversations in test = " + str(testConv) -print "=================================" -print "Total Conversations in Corpus = " + str(trainConv + devConv + testConv) +print("==============Train===============") +print(train) +print("Total Conversations in train = {}".format(trainConv)) +print("==============Dev===============") +print(devtest) +print("Total Conversations in dev = {}".format(devConv)) +print("==============Test===============") +print(evaltest) +print("Total Conversations in test = {}".format(testConv)) +print("=================================") +print("Total Conversations in Corpus = {}".format(trainConv + devConv + testConv)) diff --git a/egs/casia_hwdb/v1/cmd.sh b/egs/casia_hwdb/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/casia_hwdb/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/casia_hwdb/v1/image b/egs/casia_hwdb/v1/image new file mode 120000 index 00000000000..6a4b3afeb09 --- /dev/null +++ b/egs/casia_hwdb/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image \ No newline at end of file diff --git a/egs/casia_hwdb/v1/local/augment_data.sh b/egs/casia_hwdb/v1/local/augment_data.sh new file mode 100755 index 00000000000..1f13ed15ded --- /dev/null +++ b/egs/casia_hwdb/v1/local/augment_data.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/casia_hwdb/v1/local/chain/compare_wer.sh b/egs/casia_hwdb/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ab880c1adb5 --- /dev/null +++ b/egs/casia_hwdb/v1/local/chain/compare_wer.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/casia_hwdb/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/casia_hwdb/v1/local/chain/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..300c8ae8e31 --- /dev/null +++ b/egs/casia_hwdb/v1/local/chain/run_cnn_e2eali_1b.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# CER 15.44 13.57 +# Final train prob 0.0616 -0.0512 +# Final valid prob 0.0390 -0.0718 +# Final train prob (xent) -0.6199 +# Final valid prob (xent) -0.7448 + +set -e -o pipefail + +data_dir=data +exp_dir=exp + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
+affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=2000 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=180 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn3 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn6 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn8 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn9 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + 
relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=4 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=16,8 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=wait \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi diff --git a/egs/casia_hwdb/v1/local/chain/run_flatstart_cnn1a.sh b/egs/casia_hwdb/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..023fbff1c14 --- /dev/null +++ b/egs/casia_hwdb/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) + +# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b +# System e2e_cnn_1a cnn_e2eali_1b +# CER 15.44 13.57 +# Final train prob 0.0616 -0.0512 +# Final valid prob 0.0390 -0.0718 +# Final train prob (xent) -0.6199 +# Final valid prob (xent) -0.7448 + +set -e + +data_dir=data +exp_dir=exp + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=4 +num_jobs_final=8 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + $data_dir/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat $data_dir/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py $data_dir/lang \| \ + utils/sym2int.pl -f 2- $data_dir/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=1500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=180 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --use-gpu=wait \ + --feat-dir $data_dir/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $data_dir/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph $data_dir/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/casia_hwdb/v1/local/extract_database.sh b/egs/casia_hwdb/v1/local/extract_database.sh new file mode 100755 index 00000000000..1af3713d586 --- /dev/null +++ b/egs/casia_hwdb/v1/local/extract_database.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright 2018 Chun-Chieh Chang + +# The original format of the dataset given is GEDI and page images. +# This script is written to create line images from page images. +# It also creates csv files from the GEDI files. + +database_train=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +database_competition=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +cangjie_url=https://raw.githubusercontent.com/wanleung/libcangjie/master/tables/cj5-cc.txt +download_dir=download + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +mkdir -p ${download_dir}/{Train,Test} +for task in 0 1 2; do + for datasplit in Train Test; do + unzip -q -d ${download_dir}/${datasplit} ${database_train}/CASIA-HWDB2.${task}/${datasplit}_Dgr.zip + done +done + +unzip -q -d ${download_dir}/Competition ${database_competition}/competition-dgr.zip + +echo "Downloading table for CangJie." 
+wget -P $download_dir/ $cangjie_url || exit 1; +sed -ie '1,8d' $download_dir/cj5-cc.txt diff --git a/egs/casia_hwdb/v1/local/extract_features.sh b/egs/casia_hwdb/v1/local/extract_features.sh new file mode 100755 index 00000000000..f75837ae5b3 --- /dev/null +++ b/egs/casia_hwdb/v1/local/extract_features.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +nj=4 +cmd=run.pl +feat_dim=40 +fliplr=false +augment='no_aug' +num_channels=3 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/casia_hwdb/v1/local/gen_topo.py b/egs/casia_hwdb/v1/local/gen_topo.py new file mode 100755 index 00000000000..f64dcc5eec1 --- /dev/null +++ b/egs/casia_hwdb/v1/local/gen_topo.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_cj5_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +cj5_phones = [] +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if "cj5" in phone: + cj5_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in cj5_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that are cj5 +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in cj5_phones])) +print("") +for x in range(0, args.num_cj5_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_cj5_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/casia_hwdb/v1/local/normalize_text.py b/egs/casia_hwdb/v1/local/normalize_text.py new file mode 100755 index 00000000000..80c4e3ad3ab --- /dev/null +++ b/egs/casia_hwdb/v1/local/normalize_text.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +# This script reads in text and outputs the normalized version + +import io +import re +import sys +import unicodedata + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8") +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8") +for line in sys.stdin: + line = line.strip() + line = unicodedata.normalize('NFC', line) + line = re.sub(r'\s', ' ', line) + sys.stdout.write(line + '\n') diff --git a/egs/casia_hwdb/v1/local/prepare_dict.sh b/egs/casia_hwdb/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..cf2ecb1ce9b --- /dev/null +++ b/egs/casia_hwdb/v1/local/prepare_dict.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +data_dir=data + +. 
./utils/parse_options.sh || exit 1; + +base_dir=$(echo "$DIRECTORY" | cut -d "/" -f2) + +mkdir -p $dir + +cut -d' ' -f1 download/cj5-cc.txt | ./utils/lang/bpe/learn_bpe.py -s 300 > $dir/bpe.out +cut -d' ' -f1 download/cj5-cc.txt | ./utils/lang/bpe/apply_bpe.py -c $dir/bpe.out | sed 's/@@//g' > $dir/bpe_text +cut -d' ' -f2- download/cj5-cc.txt | sed 's/ //g' > $dir/ids +paste -d' ' $dir/bpe_text $dir/ids > $dir/cj5-cc.txt +local/prepare_lexicon.py --data-dir $data_dir $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/casia_hwdb/v1/local/prepare_lexicon.py b/egs/casia_hwdb/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..224c199ecef --- /dev/null +++ b/egs/casia_hwdb/v1/local/prepare_lexicon.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Chun-Chieh Chang + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +parser.add_argument('--data-dir', type=str, default='data', help='Path to text file') +args = parser.parse_args() + +### main ### +radical = ['日', '月', '金', '木', '水', '火', '土', '竹', '戈', '十', '大', '中', '一', '弓', '人', '心', '手','口','尸','廿','山','女','田','卜'] +lex = {} +text_path = os.path.join(args.data_dir, 'train', 'text') +text_fh = open(text_path, 'r', encoding='utf-8') + +# Used specially for Chinese. +# Uses the ChangJie keyboard input method to create subword units for Chinese. +cj5_table = {} +with open(os.path.join(args.dir, 'cj5-cc.txt'), 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split() + if not line_vect[0].startswith('yyy') and not line_vect[0].startswith('z'): + cj5_table[line_vect[-1]] = "cj5_" + " cj5_".join(line_vect[:-1]) +# lex[line_vect[1]] = "cj5_" + " cj5_".join(list(line_vect[0])) + +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split() + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word + characters = " ".join([ 'SIL' if char == '|' else char if char in radical else cj5_table[char] if char in cj5_table else char for char in characters]) + characters = characters.replace('#','') + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/casia_hwdb/v1/local/process_data.py b/egs/casia_hwdb/v1/local/process_data.py new file mode 100755 index 00000000000..8548ac2c58e --- /dev/null +++ b/egs/casia_hwdb/v1/local/process_data.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# 2018 Chun Chieh Chang + +""" This script reads the extracted Farsi OCR (yomdle and slam) database files + and creates the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + Eg. local/process_data.py data/download/ data/local/splits/train.txt data/train + Eg. 
text file: english_phone_books_0001_1 To sum up, then, it would appear that + utt2spk file: english_phone_books_0001_0 english_phone_books_0001 + images.scp file: english_phone_books_0001_0 \ + data/download/truth_line_image/english_phone_books_0001_0.png +""" + +import argparse +import numpy as np +import os +import re +import struct +import sys +import unicodedata +from collections import namedtuple +from math import atan2, cos, sin, pi, degrees, sqrt +from PIL import Image +from scipy import misc +from scipy.spatial import ConvexHull + +parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files") +parser.add_argument('database_path', type=str, help='Path to data') +parser.add_argument('out_dir', type=str, help='directory to output files') +parser.add_argument('--padding', type=int, default=100, help='Padding so BBox does not exceed image area') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points') + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. + Returns + ------- + (float, float): unit vector + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0]) / dis_0_to_1, \ + (pt1[1] - pt0[1]) / dis_0_to_1 + + +def orthogonal_vector(vector): + """ Given a vector, returns a orthogonal/perpendicular vector of equal length. + Returns + ------ + (float, float): A vector that points in the direction orthogonal to vector. + """ + return -1 * vector[1], vector[0] + +def bounding_area(index, hull): + """ Given index location in an array and convex hull, it gets two points + hull[index] and hull[index+1]. From these two points, it returns a named + tuple that mainly contains area of the box that bounds the hull. This + bounding box orintation is same as the orientation of the lines formed + by the point hull[index] and hull[index+1]. + Returns + ------- + a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
+ (it's orthogonal vector can be found with the orthogonal_vector function) + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + float(len_p) / 2, min_o + float(len_o) / 2), + 'unit_vector': unit_vector_p} + +def to_xy_coordinates(unit_vector_angle, point): + """ Given angle from horizontal axis and a point from origin, + returns converted unit vector coordinates in x, y coordinates. + angle of unit vector should be in radians. + Returns + ------ + (float, float): converted x,y coordinate of the unit vector. + """ + angle_orthogonal = unit_vector_angle + pi / 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + input + ----- + center_of_rotation (float, float): angle of unit vector to be in radians. + angle (float): angle of rotation to be in radians. + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Returns + ------ + [(float, float)]: Rotated points around center of rotation by angle + """ + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination, returns the corner + locations of the rectangle. + Returns + ------ + [(float, float)]: 4 corner points of rectangle. + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + +def get_orientation(origin, p1, p2): + """ + Given origin and two points, return the orientation of the Point p1 with + regards to Point p2 using origin. + Returns + ------- + integer: Negative if p1 is clockwise of p2. + """ + difference = ( + ((p2[0] - origin[0]) * (p1[1] - origin[1])) + - ((p1[0] - origin[0]) * (p2[1] - origin[1])) + ) + return difference + +def compute_hull(points): + """ + Given input list of points, return a list of points that + made up the convex hull. 
+ Returns + ------- + [(float, float)]: convexhull points + """ + hull_points = [] + start = points[0] + min_x = start[0] + for p in points[1:]: + if p[0] < min_x: + min_x = p[0] + start = p + point = start + hull_points.append(start) + + far_point = None + while far_point is not start: + p1 = None + for p in points: + if p is point: + continue + else: + p1 = p + break + + far_point = p1 + + for p2 in points: + if p2 is point or p2 is p1: + continue + else: + direction = get_orientation(point, far_point, p2) + if direction > 0: + far_point = p2 + + hull_points.append(far_point) + point = far_point + return hull_points + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. RADIANS + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + #hull_ordered = compute_hull(points) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle))) + +def get_center(im): + """ Given image, returns the location of center pixel + Returns + ------- + (int, int): center of the image + """ + center_x = float(im.size[0]) / 2 + center_y = float(im.size[1]) / 2 + return int(center_x), int(center_y) + +def get_horizontal_angle(unit_vector_angle): + """ Given an angle in radians, returns angle of the unit vector in + first or fourth quadrant. + Returns + ------ + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + """ + if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + +def get_smaller_angle(bounding_box): + """ Given a rectangle, returns its smallest absolute angle from horizontal axis. + Returns + ------ + (float): smallest angle of the rectangle to be in radians. 
+ """ + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + +def rotated_points(bounding_box, center): + """ Given the rectangle, returns corner points of rotated rectangle. + It rotates the rectangle around the center by its smallest angle. + Returns + ------- + [(int, int)]: 4 corner points of rectangle. + """ + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + +def pad_image(image): + """ Given an image, returns a padded image around the border. + This routine save the code from crashing if bounding boxes that are + slightly outside the page boundary. + Returns + ------- + image: page image + """ + offset = int(args.padding // 2) + padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") + padded_image.paste(im = image, box = (offset, offset)) + return padded_image + +def update_minimum_bounding_box_input(bounding_box_input): + """ Given list of 2D points, returns list of 2D points shifted by an offset. 
+ Returns + ------ + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + updated_minimum_bounding_box_input = [] + offset = int(args.padding // 2) + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + +### main ### +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +for filename in sorted(os.listdir(args.database_path)): + if filename.endswith('.dgr'): + with open(os.path.join(args.database_path, filename), 'rb') as f: + iHdSize = struct.unpack('i', f.read(4))[0] + szFormatCode = struct.unpack(''.join('c' for x in range(0,8)), f.read(8)) + szFormatCode = "".join([x.decode('utf8') for x in szFormatCode]) + szIllustr = f.read(iHdSize - 36) + szCodeType = struct.unpack(''.join(['c' for x in range(0,20)]), f.read(20)) + szCodeType = "".join([x.decode('utf8') for x in szCodeType]) + sCodeLen = struct.unpack('h', f.read(2))[0] + sBitApp = struct.unpack('h', f.read(2))[0] + iImgHei = struct.unpack('i', f.read(4))[0] + iImgWid = struct.unpack('i', f.read(4))[0] + pDocImg = Image.new('L', (iImgWid, iImgHei), (255)) + iLineNum = struct.unpack('i', f.read(4))[0] + text_dict = {} + image_dict = {} + for i in range(0, iLineNum): + iWordNum = struct.unpack('i', f.read(4))[0] + for j in range(0, iWordNum): + pWordLabel = f.read(sCodeLen).decode('gb18030', errors='ignore') + sTop = struct.unpack('h', f.read(2))[0] + sLeft = struct.unpack('h', f.read(2))[0] + sHei = struct.unpack('h', f.read(2))[0] + sWid = struct.unpack('h', f.read(2))[0] + if i in text_dict: + text_dict[i] += [pWordLabel] + else: + text_dict[i] = [pWordLabel] + if i in image_dict: + image_dict[i] += [[sTop, sLeft, sHei, sWid]] + else: + image_dict[i] = [[sTop, sLeft, sHei, sWid]] + pTmpData = struct.unpack("{}B".format(sHei * sWid), f.read(sHei * sWid)) + character = misc.toimage(np.array(pTmpData).reshape(sHei, sWid)) + pDocImg.paste(character, (sLeft, sTop)) + pDocImg.save(os.path.join(args.out_dir, 'data', 'images', os.path.splitext(filename)[0] + '.png'), 'png') + + im_page = pad_image(pDocImg) + for i in range(0, iLineNum): + text = "" + points = [] + for j, char in enumerate(text_dict[i]): + text += char + points.append([image_dict[i][j][1], image_dict[i][j][0]]) + points.append([image_dict[i][j][1] + image_dict[i][j][3], image_dict[i][j][0]]) + points.append([image_dict[i][j][1], image_dict[i][j][0] + image_dict[i][j][2]]) + points.append([image_dict[i][j][1] + image_dict[i][j][3], image_dict[i][j][0] + image_dict[i][j][2]]) + updated_mbb_input = update_minimum_bounding_box_input(points) + bounding_box = minimum_bounding_box(updated_mbb_input) + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + min_x = int(min(x1, x2, x3, x4)) + min_y = int(min(y1, y2, y3, y4)) + max_x = int(max(x1, x2, x3, x4)) + max_y = int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im_page.crop(box) + rot_points = [] + p1_new = (x1 - min_x, y1 - min_y) + p2_new = (x2 - min_x, y2 - min_y) + p3_new = (x3 - min_x, y3 - min_y) + p4_new = (x4 - min_x, y4 - min_y) 
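# The corner points are shifted into the crop's local frame (relative to
# min_x, min_y) so the rotation below is performed about the cropped region's
# own center rather than about the full page image.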
+ rot_points.append(p1_new) + rot_points.append(p2_new) + rot_points.append(p3_new) + rot_points.append(p4_new) + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points)) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample=Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + text = text.replace('\x00', '') + text = unicodedata.normalize('NFC', text) + image_id = os.path.splitext(filename)[0] + '_' + str(i).zfill(3) + image_filepath = os.path.join(args.out_dir, 'data', 'images', os.path.splitext(filename)[0] + '_' + str(i).zfill(3) + '.png') + writer_id = os.path.splitext(filename)[0].split('-')[0] + region_final.save(image_filepath, 'png') + + text_fh.write(image_id + ' ' + text + '\n') + utt2spk_fh.write(image_id + ' ' + writer_id + '\n') + image_fh.write(image_id + ' ' + image_filepath + '\n') diff --git a/egs/casia_hwdb/v1/local/score.sh b/egs/casia_hwdb/v1/local/score.sh new file mode 100755 index 00000000000..f2405205f02 --- /dev/null +++ b/egs/casia_hwdb/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh --max-lmwt 10 "$@" +steps/scoring/score_kaldi_cer.sh --max-lmwt 10 --stage 2 "$@" diff --git a/egs/casia_hwdb/v1/local/train_lm.sh b/egs/casia_hwdb/v1/local/train_lm.sh new file mode 100755 index 00000000000..bc738f217da --- /dev/null +++ b/egs/casia_hwdb/v1/local/train_lm.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the YOMDLE training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +data_dir=data + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
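# For illustration only (placeholder values, not tuned metaparameters), a
# bypassed run would look something like:
#   bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.04,0.86,0.73"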
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/casia_hwdb/v1/local/train_lm_lr.sh b/egs/casia_hwdb/v1/local/train_lm_lr.sh new file mode 100755 index 00000000000..a8b1bfb76a4 --- /dev/null +++ b/egs/casia_hwdb/v1/local/train_lm_lr.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the YOMDLE+Extra training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +data_dir=data +extra_lm=download/extra_lm.txt +order=3 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. 
+ cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + cat ${extra_lm} | \ + local/normalize_text.py | \ + utils/lang/bpe/prepend_words.py | \ + python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | \ + sed 's/@@//g' > ${dir}/data/text/extra_lm.txt + + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + nr=`cat $data_dir/train/text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + + # use the training data as an additional data source. + # we can later fold the dev data into this. + head -n $nr_train $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < $data_dir/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/{train,extra_lm}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + #cat ${dir}/data/text/extra_fa.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='extra_lm=10 train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=30 --warm-start-ratio=1 \ + --min-counts="$min_counts" \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/casia_hwdb/v1/local/wer_output_filter b/egs/casia_hwdb/v1/local/wer_output_filter new file mode 100755 index 00000000000..8702738144f --- /dev/null +++ b/egs/casia_hwdb/v1/local/wer_output_filter @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import io +import sys +import unicodedata + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); +for line in sys.stdin: + line = unicodedata.normalize('NFKC', line) + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + print("{} {}".format(uttid, transcript)) diff --git a/egs/casia_hwdb/v1/path.sh b/egs/casia_hwdb/v1/path.sh new file mode 100644 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/casia_hwdb/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/casia_hwdb/v1/run.sh b/egs/casia_hwdb/v1/run.sh new file mode 100755 index 00000000000..44d1f26117c --- /dev/null +++ b/egs/casia_hwdb/v1/run.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +set -e +stage=0 +nj=60 + +database_train=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +database_competition=/export/corpora5/handwriting_ocr/CASIA_HWDB/Offline/ +data_dir=data +exp_dir=exp + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if [ $stage -le -1 ]; then + mkdir download/Train + mkdir download/Test + mkdir download/Competition + local/extract_database.sh --database-train $database_train \ + --database-competition $database_competition +fi + +if [ $stage -le 0 ]; then + mkdir -p data/train/data/images + mkdir -p data/test/data/images + mkdir -p data/competition/data/images + local/process_data.py download/Train data/train + local/process_data.py download/Test data/test + local/process_data.py download/Competition data/competition + image/fix_data_dir.sh ${data_dir}/test + image/fix_data_dir.sh ${data_dir}/train + image/fix_data_dir.sh ${data_dir}/competition +fi + +mkdir -p $data_dir/{train,test}/data +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py --feat-dim 60 $data_dir/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $data_dir/train + + for datasplit in train test competition; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $datasplit. " + echo "Date: $(date)." + local/extract_features.sh --nj $nj --cmd "$cmd" \ + --feat-dim 60 --num-channels 3 \ + $data_dir/${datasplit} + steps/compute_cmvn_stats.sh $data_dir/${datasplit} || exit 1; + done + + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." + utils/fix_data_dir.sh $data_dir/train +fi + +#if [ $stage -le 2 ]; then +# for datasplit in train; do +# echo "$(date) stage 2: Performing augmentation, it will double training data" +# local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 60 $data_dir/${datasplit} $data_dir/${datasplit}_aug $data_dir +# steps/compute_cmvn_stats.sh $data_dir/${datasplit}_aug || exit 1; +# done +#fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing dictionary and lang..." + if [ ! -f $data_dir/train/bpe.out ]; then + cut -d' ' -f2- $data_dir/train/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > $data_dir/train/bpe.out + for datasplit in test train; do + cut -d' ' -f1 $data_dir/$datasplit/text > $data_dir/$datasplit/ids + cut -d' ' -f2- $data_dir/$datasplit/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > $data_dir/$datasplit/bpe_text + mv $data_dir/$datasplit/text $data_dir/$datasplit/text.old + paste -d' ' $data_dir/$datasplit/ids $data_dir/$datasplit/bpe_text > $data_dir/$datasplit/text + done + fi + + local/prepare_dict.sh --data-dir $data_dir --dir $data_dir/local/dict + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + $data_dir/local/dict "" $data_dir/lang/temp $data_dir/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $data_dir/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh --data-dir $data_dir --dir $data_dir/local/local_lm + utils/format_lm.sh $data_dir/lang $data_dir/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + $data_dir/local/dict/lexicon.txt $data_dir/lang_test +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." 
+ local/chain/run_flatstart_cnn1a.sh --nj $nj --train-set train --data-dir $data_dir --exp-dir $exp_dir +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" --use-gpu false \ + --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \ + $data_dir/train $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj --train-set train --data-dir $data_dir --exp-dir $exp_dir +fi diff --git a/egs/casia_hwdb/v1/steps b/egs/casia_hwdb/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/casia_hwdb/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/casia_hwdb/v1/utils b/egs/casia_hwdb/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/casia_hwdb/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/chime1/s5/local/chime1_prepare_data.sh b/egs/chime1/s5/local/chime1_prepare_data.sh index e60c46ff8da..c5963b5d4ab 100755 --- a/egs/chime1/s5/local/chime1_prepare_data.sh +++ b/egs/chime1/s5/local/chime1_prepare_data.sh @@ -53,7 +53,7 @@ for x in "devel" "test"; do for sid in `seq 34`; do sid2=`printf "s%02d" $sid` ls -1 $wav_dir/*/s${sid}_*.wav \ - | perl -ape "s/(.*)\/(.*)\/s.*_(.*).wav/${sid2}_\3_\2$\t\1\/\2\/s${sid}_\3.wav/;" \ + | perl -ape "s/(.*)\/(.*)\/s.*_(.*).wav/${sid2}_\3_\2\t\1\/\2\/s${sid}_\3.wav/;" \ | sort >> $scp done fi @@ -68,7 +68,7 @@ for x in $set_list; do # Create utt2spk files # No speaker ID - perl -ape "s/(.*)\t.*/\1$\t\1/;" < "$scp" > "$data/$x/utt2spk" + perl -ape "s/(.*)\t.*/\1\t\1/;" < "$scp" > "$data/$x/utt2spk" # Use speaker ID # perl -ape "s/(s..)(.*)\\t.*/\1\2\t\1/;" < "$scp" > "$data/$x/utt2spk" diff --git a/egs/chime4/s5_1ch/RESULTS b/egs/chime4/s5_1ch/RESULTS index c0146b772b7..3e5f752a803 100644 --- a/egs/chime4/s5_1ch/RESULTS +++ b/egs/chime4/s5_1ch/RESULTS @@ -17,7 +17,22 @@ et05_simu WER: 33.30% (Average), 26.65% (BUS), 38.40% (CAFE), 34.68% (PEDESTRIAN et05_real WER: 37.54% (Average), 51.92% (BUS), 39.67% (CAFE), 34.04% (PEDESTRIAN), 24.54% (STREET) ------------------- -Advanced baseline: +GMM noisy multi-condition without enhancement using 6 channel data +exp/tri3b_tr05_multi_noisy/best_wer_isolated_1ch_track.result +------------------- +best overall dt05 WER 22.32% (language model weight = 10) +------------------- +dt05_simu WER: 23.24% (Average), 19.28% (BUS), 28.41% (CAFE), 19.16% (PEDESTRIAN), 26.12% (STREET) +------------------- +dt05_real WER: 21.40% (Average), 25.86% (BUS), 21.81% (CAFE), 16.80% (PEDESTRIAN), 21.12% (STREET) +------------------- +et05_simu WER: 32.03% (Average), 25.42% (BUS), 36.25% (CAFE), 33.34% (PEDESTRIAN), 33.10% (STREET) +------------------- +et05_real WER: 36.14% (Average), 49.28% (BUS), 38.79% (CAFE), 32.44% (PEDESTRIAN), 24.06% (STREET) +------------------- + +GMM noisy multi-condition without enhancement using 6 channel data plus enhanced data +exp/tri3b_tr05_multi_noisy/best_wer_isolated_1ch_track.result ------------------- best overall dt05 WER 22.28% (language model weight = 10) ------------------- @@ -30,6 +45,34 @@ et05_simu WER: 32.18% (Average), 25.33% (BUS), 37.37% (CAFE), 33.36% (PEDESTRIAN et05_real WER: 35.54% (Average), 49.07% (BUS), 38.94% 
(CAFE), 31.60% (PEDESTRIAN), 22.56% (STREET) ------------------- +GMM noisy multi-condition with BLSTM masking using 6 channel data +exp/tri3b_tr05_multi_noisy/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 28.82% (language model weight = 14) +------------------- +dt05_simu WER: 28.54% (Average), 25.46% (BUS), 33.47% (CAFE), 25.19% (PEDESTRIAN), 30.06% (STREET) +------------------- +dt05_real WER: 29.10% (Average), 33.46% (BUS), 31.80% (CAFE), 25.71% (PEDESTRIAN), 25.42% (STREET) +------------------- +et05_simu WER: 36.10% (Average), 30.97% (BUS), 40.42% (CAFE), 35.82% (PEDESTRIAN), 37.19% (STREET) +------------------- +et05_real WER: 41.84% (Average), 52.57% (BUS), 46.41% (CAFE), 39.87% (PEDESTRIAN), 28.52% (STREET) +------------------- + +GMM noisy multi-condition with BLSTM masking using 6 channel data plus enhanced data +exp/tri3b_tr05_multi_noisy/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 22.72% (language model weight = 13) +------------------- +dt05_simu WER: 23.37% (Average), 20.71% (BUS), 28.26% (CAFE), 19.85% (PEDESTRIAN), 24.66% (STREET) +------------------- +dt05_real WER: 22.07% (Average), 25.92% (BUS), 24.32% (CAFE), 18.47% (PEDESTRIAN), 19.58% (STREET) +------------------- +et05_simu WER: 30.41% (Average), 24.08% (BUS), 35.86% (CAFE), 30.80% (PEDESTRIAN), 30.89% (STREET) +------------------- +et05_real WER: 34.02% (Average), 44.68% (BUS), 37.19% (CAFE), 31.73% (PEDESTRIAN), 22.49% (STREET) +------------------- + DNN sMBR exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_isolated_1ch_track.result ------------------- @@ -45,7 +88,7 @@ et05_simu WER: 24.13% (Average), 19.65% (BUS), 27.57% (CAFE), 23.14% (PEDESTRIAN et05_real WER: 27.68% (Average), 40.40% (BUS), 28.95% (CAFE), 24.25% (PEDESTRIAN), 17.13% (STREET) ------------------- -Advanced baseline: +DNN sMBR using all 6 channel data ------------------- best overall dt05 WER 12.84% (language model weight = 12) (Number of iterations = 3) @@ -73,7 +116,7 @@ et05_simu WER: 22.32% (Average), 17.82% (BUS), 25.48% (CAFE), 21.70% (PEDESTRIAN et05_real WER: 24.92% (Average), 37.52% (BUS), 26.45% (CAFE), 21.28% (PEDESTRIAN), 14.44% (STREET) ------------------- -Advanced baseline: +5-gram rescoring using all 6 channel data ------------------- best overall dt05 WER 11.07% (language model weight = 12) ------------------- @@ -100,7 +143,7 @@ et05_simu WER: 20.84% (Average), 16.49% (BUS), 23.91% (CAFE), 20.25% (PEDESTRIAN et05_real WER: 23.70% (Average), 35.93% (BUS), 24.60% (CAFE), 19.94% (PEDESTRIAN), 14.36% (STREET) ------------------- -Advanced baseline: +RNNLM using all 6 channel data ------------------- best overall dt05 WER 9.99% (language model weight = 14) ------------------- @@ -113,30 +156,86 @@ et05_simu WER: 17.31% (Average), 12.81% (BUS), 20.32% (CAFE), 17.03% (PEDESTRIAN et05_real WER: 18.10% (Average), 26.58% (BUS), 19.97% (CAFE), 14.44% (PEDESTRIAN), 11.43% (STREET) ------------------- -TDNN -exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result +TDNN using all 6 channel data +exp/chain/tdnniso_sp/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 9.56% (language model weight = 10) +------------------- +dt05_simu WER: 10.23% (Average), 8.86% (BUS), 13.13% (CAFE), 7.94% (PEDESTRIAN), 11.00% (STREET) +------------------- +dt05_real WER: 8.89% (Average), 11.90% (BUS), 8.54% (CAFE), 6.09% (PEDESTRIAN), 9.03% (STREET) +------------------- +et05_simu WER: 16.48% (Average), 12.87% (BUS), 18.60% (CAFE), 15.52% (PEDESTRIAN), 18.94% (STREET) 
+------------------- +et05_real WER: 16.34% (Average), 24.32% (BUS), 16.51% (CAFE), 13.43% (PEDESTRIAN), 11.11% (STREET) +------------------- + +TDNN+RNNLM using all 6 channel data +exp/chain/tdnniso_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 7.21% (language model weight = 11) +------------------- +dt05_simu WER: 7.78% (Average), 6.52% (BUS), 10.27% (CAFE), 5.69% (PEDESTRIAN), 8.66% (STREET) +------------------- +dt05_real WER: 6.64% (Average), 9.06% (BUS), 6.62% (CAFE), 4.26% (PEDESTRIAN), 6.61% (STREET) +------------------- +et05_simu WER: 13.54% (Average), 10.22% (BUS), 15.07% (CAFE), 12.94% (PEDESTRIAN), 15.93% (STREET) +------------------- +et05_real WER: 12.92% (Average), 20.79% (BUS), 12.35% (CAFE), 9.62% (PEDESTRIAN), 8.91% (STREET) +------------------- + +TDNN with BLSTM masking using all 6 channel data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 18.00% (language model weight = 13) +------------------- +dt05_simu WER: 18.81% (Average), 15.34% (BUS), 23.58% (CAFE), 15.27% (PEDESTRIAN), 21.06% (STREET) +------------------- +dt05_real WER: 17.18% (Average), 21.12% (BUS), 19.45% (CAFE), 11.61% (PEDESTRIAN), 16.53% (STREET) +------------------- +et05_simu WER: 25.85% (Average), 20.06% (BUS), 30.13% (CAFE), 26.88% (PEDESTRIAN), 26.32% (STREET) +------------------- +et05_real WER: 27.68% (Average), 37.88% (BUS), 29.51% (CAFE), 24.74% (PEDESTRIAN), 18.60% (STREET) +------------------- + +TDNN+RNNLM with BLSTM masking using all 6 channel data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result +------------------- +best overall dt05 WER 14.38% (language model weight = 14) +------------------- +dt05_simu WER: 15.62% (Average), 12.36% (BUS), 20.46% (CAFE), 12.11% (PEDESTRIAN), 17.55% (STREET) +------------------- +dt05_real WER: 13.15% (Average), 16.43% (BUS), 15.21% (CAFE), 8.59% (PEDESTRIAN), 12.37% (STREET) +------------------- +et05_simu WER: 21.61% (Average), 16.01% (BUS), 25.87% (CAFE), 22.15% (PEDESTRIAN), 22.39% (STREET) +------------------- +et05_real WER: 22.47% (Average), 32.34% (BUS), 24.08% (CAFE), 18.91% (PEDESTRIAN), 14.57% (STREET) +------------------- + +TDNN with BLSTM masking using all 6 channel data plus enhanced data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result ------------------- -best overall dt05 WER 10.37% (language model weight = 9) +best overall dt05 WER 11.73% (language model weight = 12) ------------------- -dt05_simu WER: 10.79% (Average), 9.62% (BUS), 13.70% (CAFE), 8.23% (PEDESTRIAN), 11.61% (STREET) +dt05_simu WER: 13.06% (Average), 10.78% (BUS), 17.20% (CAFE), 10.15% (PEDESTRIAN), 14.10% (STREET) ------------------- -dt05_real WER: 9.95% (Average), 14.38% (BUS), 8.81% (CAFE), 6.43% (PEDESTRIAN), 10.19% (STREET) +dt05_real WER: 10.40% (Average), 13.44% (BUS), 10.72% (CAFE), 7.29% (PEDESTRIAN), 10.16% (STREET) ------------------- -et05_simu WER: 17.18% (Average), 13.75% (BUS), 19.48% (CAFE), 15.82% (PEDESTRIAN), 19.67% (STREET) +et05_simu WER: 19.48% (Average), 14.48% (BUS), 23.10% (CAFE), 19.84% (PEDESTRIAN), 20.49% (STREET) ------------------- -et05_real WER: 18.36% (Average), 30.77% (BUS), 16.17% (CAFE), 14.29% (PEDESTRIAN), 12.20% (STREET) +et05_real WER: 19.08% (Average), 27.43% (BUS), 19.76% (CAFE), 16.93% (PEDESTRIAN), 12.22% (STREET) ------------------- -TDNN+RNNLM -exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM with BLSTM masking using all 6 channel data 
plus enhanced data +exp/chain/tdnn1a_sp/best_wer_single_BLSTMmask.result ------------------- -best overall dt05 WER 7.98% (language model weight = 10) +best overall dt05 WER 8.95% (language model weight = 13) ------------------- -dt05_simu WER: 8.40% (Average), 7.37% (BUS), 10.91% (CAFE), 6.36% (PEDESTRIAN), 8.97% (STREET) +dt05_simu WER: 10.28% (Average), 8.51% (BUS), 13.88% (CAFE), 7.58% (PEDESTRIAN), 11.17% (STREET) ------------------- -dt05_real WER: 7.56% (Average), 11.58% (BUS), 6.58% (CAFE), 4.41% (PEDESTRIAN), 7.65% (STREET) +dt05_real WER: 7.62% (Average), 10.25% (BUS), 7.86% (CAFE), 5.31% (PEDESTRIAN), 7.05% (STREET) ------------------- -et05_simu WER: 13.91% (Average), 10.87% (BUS), 15.09% (CAFE), 12.78% (PEDESTRIAN), 16.88% (STREET) +et05_simu WER: 16.18% (Average), 12.03% (BUS), 18.71% (CAFE), 16.62% (PEDESTRIAN), 17.35% (STREET) ------------------- -et05_real WER: 14.99% (Average), 26.88% (BUS), 13.32% (CAFE), 10.07% (PEDESTRIAN), 9.71% (STREET) +et05_real WER: 15.08% (Average), 22.96% (BUS), 15.45% (CAFE), 12.74% (PEDESTRIAN), 9.17% (STREET) ------------------- diff --git a/egs/chime4/s5_1ch/local/CHiME3_simulate_data_patched_parallel.m b/egs/chime4/s5_1ch/local/CHiME3_simulate_data_patched_parallel.m new file mode 100755 index 00000000000..49c1ed48018 --- /dev/null +++ b/egs/chime4/s5_1ch/local/CHiME3_simulate_data_patched_parallel.m @@ -0,0 +1,362 @@ +function CHiME3_simulate_data_patched_parallel(official,nj,chime4_dir,chime3_dir) + +% CHIME3_SIMULATE_DATA Creates simulated data for the 3rd CHiME Challenge +% +% CHiME3_simulate_data +% CHiME3_simulate_data(official) +% +% Input: +% official: boolean flag indicating whether to recreate the official +% Challenge data (default) or to create new (non-official) data +% +% If you use this software in a publication, please cite: +% +% Jon Barker, Ricard Marxer, Emmanuel Vincent, and Shinji Watanabe, The +% third 'CHiME' Speech Separation and Recognition Challenge: Dataset, +% task and baselines, submitted to IEEE 2015 Automatic Speech Recognition +% and Understanding Workshop (ASRU), 2015. 
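A minimal usage sketch (not part of the original patch): the parallel simulation function above would be launched in MATLAB batch mode from the recipe's shell scripts, following the same pattern this patch uses for stoi_estoi_sdr.m; the corpus paths and job count here are illustrative assumptions, not values taken from the recipe.

  nj=20
  chime4_dir=/export/corpora/CHiME4/CHiME3   # hypothetical location of the CHiME4 data
  chime3_dir=/export/corpora/CHiME3          # hypothetical location of the CHiME3 data
  matlab -nodisplay -nosplash -r "addpath('local'); CHiME3_simulate_data_patched_parallel(1,$nj,'$chime4_dir','$chime3_dir'); exit"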
+% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) +% Inria (Emmanuel Vincent) +% Mitsubishi Electric Research Labs (Shinji Watanabe) +% This software is distributed under the terms of the GNU Public License +% version 3 (http://www.gnu.org/licenses/gpl.txt) +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +utils_folder = sprintf('%s/tools/utils', chime4_dir); +enhancement_folder = sprintf('%s/tools/enhancement/', chime3_dir); +addpath(utils_folder,'-end'); +addpath(enhancement_folder); +sim_folder = sprintf('%s/tools/simulation', chime4_dir); +addpath(sim_folder); +upath = sprintf('%s/data/audio/16kHz/isolated/', chime4_dir); +cpath = sprintf('%s/data/audio/16kHz/embedded/', chime4_dir); +bpath = sprintf('%s/data/audio/16kHz/backgrounds/', chime4_dir); +apath = sprintf('%s/data/annotations/', chime4_dir); +upath_ext = 'local/nn-gev/data/audio/16kHz/isolated_ext/'; +upath_simu = 'local/nn-gev/data/audio/16kHz/isolated/'; +nchan=6; + +% Define hyper-parameters +pow_thresh=-20; % threshold in dB below which a microphone is considered to fail +wlen_sub=256; % STFT window length in samples +blen_sub=4000; % average block length in samples for speech subtraction (250 ms) +ntap_sub=12; % filter length in frames for speech subtraction (88 ms) +wlen_add=1024; % STFT window length in samples for speaker localization +del=-3; % minimum delay (0 for a causal filter) + +%% Create simulated training dataset from original WSJ0 data %% +if exist('equal_filter.mat','file'), + load('equal_filter.mat'); +else + % Compute average power spectrum of booth data + nfram=0; + bth_spec=zeros(wlen_sub/2+1,1); + sets={'tr05' 'dt05'}; + for set_ind=1:length(sets), + set=sets{set_ind}; + mat=json2mat([apath set '_bth.json']); + for utt_ind=1:length(mat), + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_BTH']; + fprintf('%s\n',[upath set '_bth/' oname '.CH0.wav']); + o=audioread([upath set '_bth/' oname '.CH0.wav']); + O=stft_multi(o.',wlen_sub); + nfram=nfram+size(O,2); + bth_spec=bth_spec+sum(abs(O).^2,2); + end + end + bth_spec=bth_spec/nfram; + + % Compute average power spectrum of original WSJ0 data + nfram=0; + org_spec=zeros(wlen_sub/2+1,1); + olist=dir([upath 'tr05_org/*.wav']); + for f=1:length(olist), + oname=olist(f).name; + o=audioread([upath 'tr05_org/' oname]); + O=stft_multi(o.',wlen_sub); + nfram=nfram+size(O,2); + org_spec=org_spec+sum(abs(O).^2,2); + end + org_spec=org_spec/nfram; + + % Derive equalization filter + equal_filter=sqrt(bth_spec./org_spec); + save('equal_filter.mat','equal_filter'); +end +% Read official annotations +if official, + mat=json2mat([apath 'tr05_simu.json']); +% Create new (non-official) annotations +else + mat=json2mat([apath 'tr05_org.json']); + ir_mat=json2mat([apath 'tr05_real.json']); + for utt_ind=1:length(mat), + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_ORG']; + osize=audioread([upath 'tr05_org/' oname '.wav'],'size'); + dur=osize(1)/16000; + envirs={'BUS' 'CAF' 'PED' 'STR'}; + envir=envirs{randperm(4,1)}; % draw a random environment + mat{utt_ind}.environment=envir; + blist=dir([bpath '*' envir '.CH1.wav']); + dur_diff=inf(1,length(ir_mat)); + for ir_ind=1:length(ir_mat), + if strcmp(ir_mat{ir_ind}.environment,envir), + ir_dur=ir_mat{ir_ind}.end-ir_mat{ir_ind}.start; + dur_diff(ir_ind)=abs(ir_dur-dur); + end + end + ir_ind=find(isinf(dur_diff)); + ir_ind=ir_ind(1); + nfail=true; + while nfail, + 
bname=blist(randperm(length(blist),1)).name(1:end-8); % draw a random background recording + mat{utt_ind}.noise_wavfile=bname; + bsize=audioread([bpath bname '.CH1.wav'],'size'); + bdur=bsize(1)/16000; + mat{utt_ind}.noise_start=ceil(rand(1)*(bdur-dur)*16000)/16000; % draw a random time + mat{utt_ind}.noise_end=mat{utt_ind}.noise_start+dur; + nname=mat{utt_ind}.noise_wavfile; + nbeg=round(mat{utt_ind}.noise_start*16000)+1; + nend=round(mat{utt_ind}.noise_end*16000); + n=zeros(nend-nbeg+1,nchan); + for c=1:nchan, + n(:,c)=audioread([bpath nname '.CH' int2str(c) '.wav'],[nbeg nend]); + end + npow=sum(n.^2,1); + npow=10*log10(npow/max(npow)); + nfail=any(npow<=pow_thresh); % check for microphone failure + end + xfail=true; + while xfail, + dur_diff(ir_ind)=inf; + [~,ir_ind]=min(dur_diff); % pick impulse response from the same environment with the closest duration + if dur_diff(ir_ind)==inf, + keyboard; + end + mat{utt_ind}.ir_wavfile=ir_mat{ir_ind}.wavfile; + mat{utt_ind}.ir_start=ir_mat{ir_ind}.start; + mat{utt_ind}.ir_end=ir_mat{ir_ind}.end; + iname=mat{utt_ind}.ir_wavfile; + ibeg=round(mat{utt_ind}.ir_start*16000)+1; + iend=round(mat{utt_ind}.ir_end*16000); + x=zeros(iend-ibeg+1,nchan); + for c=1:nchan, + x(:,c)=audioread([cpath iname '.CH' int2str(c) '.wav'],[ibeg iend]); + end + xpow=sum(x.^2,1); + xpow=10*log10(xpow/max(xpow)); + xfail=any(xpow<=pow_thresh); % check for microphone failure + end + mat{utt_ind}=orderfields(mat{utt_ind}); + end + mat2json(mat,[apath 'tr05_simu_new.json']); +end + +p = parpool('local', nj); +% Loop over utterances +parfor utt_ind=1:length(mat), + if official, + udir=[upath_simu 'tr05_' lower(mat{utt_ind}.environment) '_simu/']; + udir_ext=[upath_ext 'tr05_' lower(mat{utt_ind}.environment) '_simu/']; + else + udir=[upath 'tr05_' lower(mat{utt_ind}.environment) '_simu_new/']; + end + if ~exist(udir,'dir'), + system(['mkdir -p ' udir]); + end + if ~exist(udir_ext,'dir'), + system(['mkdir -p ' udir_ext]); + end + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_ORG']; + iname=mat{utt_ind}.ir_wavfile; + nname=mat{utt_ind}.noise_wavfile; + uname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_' mat{utt_ind}.environment]; + ibeg=round(mat{utt_ind}.ir_start*16000)+1; + iend=round(mat{utt_ind}.ir_end*16000); + nbeg=round(mat{utt_ind}.noise_start*16000)+1; + nend=round(mat{utt_ind}.noise_end*16000); + + % Load WAV files + fprintf('%s\n',[upath 'tr05_org/' oname '.wav']); + o=audioread([upath 'tr05_org/' oname '.wav']); + [r,fs]=audioread([cpath iname '.CH0.wav'],[ibeg iend]); + fprintf('%s\n',[cpath iname '.CH0.wav'],[ibeg iend]); + x=zeros(iend-ibeg+1,nchan); + n=zeros(nend-nbeg+1,nchan); + for c=1:nchan, + fprintf('%s Place1\n',[cpath iname '.CH' int2str(c) '.wav']); + x(:,c)=audioread([cpath iname '.CH' int2str(c) '.wav'],[ibeg iend]); + n(:,c)=audioread([bpath nname '.CH' int2str(c) '.wav'],[nbeg nend]); + fprintf('%s Place2\n',[bpath nname '.CH' int2str(c) '.wav']); + end + + % Compute the STFT (short window) + O=stft_multi(o.',wlen_sub); + R=stft_multi(r.',wlen_sub); + X=stft_multi(x.',wlen_sub); + + % Estimate 88 ms impulse responses on 250 ms time blocks + A=estimate_ir(R,X,blen_sub,ntap_sub,del); + + % Derive SNR + Y=apply_ir(A,R,del); + y=istft_multi(Y,iend-ibeg+1).'; + SNR=sum(sum(y.^2))/sum(sum((x-y).^2)); + + % Equalize microphone + [~,nfram]=size(O); + O=O.*repmat(equal_filter,[1 nfram]); + o=istft_multi(O,nend-nbeg+1).'; + + % Compute the STFT (long window) + O=stft_multi(o.',wlen_add); + X=stft_multi(x.',wlen_add); + [nbin,nfram] = size(O); + 
+ % Localize and track the speaker + [~,TDOAx]=localize(X); + + % Interpolate the spatial position over the duration of clean speech + TDOA=zeros(nchan,nfram); + for c=1:nchan, + TDOA(c,:)=interp1(0:size(X,2)-1,TDOAx(c,:),(0:nfram-1)/(nfram-1)*(size(X,2)-1)); + end + + % Filter clean speech + Ysimu=zeros(nbin,nfram,nchan); + for f=1:nbin, + for t=1:nfram, + Df=sqrt(1/nchan)*exp(-2*1i*pi*(f-1)/wlen_add*fs*TDOA(:,t)); + Ysimu(f,t,:)=permute(Df*O(f,t),[2 3 1]); + end + end + ysimu=istft_multi(Ysimu,nend-nbeg+1).'; + + % Normalize level and add + ysimu=sqrt(SNR/sum(sum(ysimu.^2))*sum(sum(n.^2)))*ysimu; + xsimu=ysimu+n; + + % Write WAV file + for c=1:nchan, + audiowrite([udir uname '.CH' int2str(c) '.wav'],xsimu(:,c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Noise.wav'],n(:, c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Clean.wav'],ysimu(:, c), fs); + end +end + +%% Create simulated development and test datasets from booth recordings %% +sets={'dt05' 'et05'}; +for set_ind=1:length(sets), + set=sets{set_ind}; + + % Read official annotations + if official, + mat=json2mat([apath set '_simu.json']); + + % Create new (non-official) annotations + else + mat=json2mat([apath set '_real.json']); + clean_mat=json2mat([apath set '_bth.json']); + for utt_ind=1:length(mat), + for clean_ind=1:length(clean_mat), % match noisy utterance with same clean utterance (may be from a different speaker) + if strcmp(clean_mat{clean_ind}.wsj_name,mat{utt_ind}.wsj_name), + break; + end + end + noise_mat=mat{utt_ind}; + mat{utt_ind}=clean_mat{clean_ind}; + mat{utt_ind}.environment=noise_mat.environment; + mat{utt_ind}.noise_wavfile=noise_mat.wavfile; + dur=mat{utt_ind}.end-mat{utt_ind}.start; + noise_dur=noise_mat.end-noise_mat.start; + pbeg=round((dur-noise_dur)/2*16000)/16000; + pend=round((dur-noise_dur)*16000)/16000-pbeg; + mat{utt_ind}.noise_start=noise_mat.start-pbeg; + mat{utt_ind}.noise_end=noise_mat.end+pend; + mat{utt_ind}=orderfields(mat{utt_ind}); + end + mat2json(mat,[apath set '_simu_new.json']); + end + + % Loop over utterances + parfor utt_ind=1:length(mat), + if official, + udir=[upath_simu set '_' lower(mat{utt_ind}.environment) '_simu/']; + udir_ext=[upath_ext set '_' lower(mat{utt_ind}.environment) '_simu/']; + else + udir=[upath set '_' lower(mat{utt_ind}.environment) '_simu_new/']; + end + if ~exist(udir,'dir'), + system(['mkdir -p ' udir]); + end + if ~exist(udir_ext,'dir'), + system(['mkdir -p ' udir_ext]); + end + oname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_BTH']; + nname=mat{utt_ind}.noise_wavfile; + uname=[mat{utt_ind}.speaker '_' mat{utt_ind}.wsj_name '_' mat{utt_ind}.environment]; + tbeg=round(mat{utt_ind}.noise_start*16000)+1; + tend=round(mat{utt_ind}.noise_end*16000); + + % Load WAV files + o=audioread([upath set '_bth/' oname '.CH0.wav']); + [r,fs]=audioread([cpath nname '.CH0.wav'],[tbeg tend]); + nsampl=length(r); + x=zeros(nsampl,nchan); + for c=1:nchan, + x(:,c)=audioread([cpath nname '.CH' int2str(c) '.wav'],[tbeg tend]); + end + + % Compute the STFT (short window) + R=stft_multi(r.',wlen_sub); + X=stft_multi(x.',wlen_sub); + + % Estimate 88 ms impulse responses on 250 ms time blocks + A=estimate_ir(R,X,blen_sub,ntap_sub,del); + + % Filter and subtract close-mic speech + Y=apply_ir(A,R,del); + y=istft_multi(Y,nsampl).'; + level=sum(sum(y.^2)); + n=x-y; + + % Compute the STFT (long window) + O=stft_multi(o.',wlen_add); + X=stft_multi(x.',wlen_add); + [nbin,nfram] = size(O); + + % Localize and track the speaker + [~,TDOAx]=localize(X); + + % Interpolate 
the spatial position over the duration of clean speech + TDOA=zeros(nchan,nfram); + for c=1:nchan, + TDOA(c,:)=interp1(0:size(X,2)-1,TDOAx(c,:),(0:nfram-1)/(nfram-1)*(size(X,2)-1)); + end + + % Filter clean speech + Ysimu=zeros(nbin,nfram,nchan); + for f=1:nbin, + for t=1:nfram, + Df=sqrt(1/nchan)*exp(-2*1i*pi*(f-1)/wlen_add*fs*TDOA(:,t)); + Ysimu(f,t,:)=permute(Df*O(f,t),[2 3 1]); + end + end + ysimu=istft_multi(Ysimu,nsampl).'; + + % Normalize level and add + ysimu=sqrt(level/sum(sum(ysimu.^2)))*ysimu; + xsimu=ysimu+n; + + % Write WAV file + for c=1:nchan, + audiowrite([udir uname '.CH' int2str(c) '.wav'],xsimu(:,c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Noise.wav'],n(:, c),fs); + audiowrite([udir_ext uname '.CH' int2str(c) '.Clean.wav'],ysimu(:, c), fs); + end + end +end +delete(p); +end diff --git a/egs/chime4/s5_1ch/local/chain/run_tdnn_lstm_recog.sh b/egs/chime4/s5_1ch/local/chain/run_tdnn_lstm_recog.sh deleted file mode 100755 index 9348cd6fa5a..00000000000 --- a/egs/chime4/s5_1ch/local/chain/run_tdnn_lstm_recog.sh +++ /dev/null @@ -1,223 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -stage=0 -nj=30 -train=noisy -enhan=$1 -mdir=$2 -train_set=tr05_multi_${train} -test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" -gmm=tri3b_tr05_multi_${train} # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# training chunk-options -chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -#decode options -test_online_decoding=false # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# check whether run_init is executed -if [ ! -d data/lang ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check whether run_init is executed -if [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check ivector extractor -if [ ! -d $mdir/exp/nnet3${nnet3_affix}/extractor ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/nnet3${nnet3_affix}/extractor ]; then - echo "copy $mdir/exp/nnet3${nnet3_affix}/extractor" - mkdir -p exp/nnet3${nnet3_affix} - cp -r $mdir/exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ -fi - -# check tdnn-lstm graph -if [ ! -d $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k" - mkdir -p exp/chain${nnet3_affix}/tree_a_sp - cp -r $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k exp/chain${nnet3_affix}/tree_a_sp/ -fi - -# check dir -if [ ! -d $mdir/exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! 
-d exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp" - cp -r $mdir/exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp exp/chain${nnet3_affix}/ - rm -rf exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp/decode_* - rm -rf exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp/best_* -fi - -dir=exp/chain${nnet3_affix}/tdnn_lstm${affix}_sp - -# note: you don't necessarily have to change the treedir name -# each time you do a new experiment-- only if you change the -# configuration in a way that affects the tree. -tree_dir=$mdir/exp/chain${nnet3_affix}/tree_a_sp - -# make ivector for dev and eval -if [ $stage -le 2 ]; then - for datadir in ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # extracting hires features - for datadir in ${test_sets}; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires - steps/compute_cmvn_stats.sh data/${datadir}_hires - utils/fix_data_dir.sh data/${datadir}_hires - done - - # extract iVectors for the test data, but in this case we don't need the speed - # perturbation (sp). - for data in ${test_sets}; do - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l exp/chain/tdnn_lstm${affix}_sp/best_wer_$enhan.result - head -n 15 exp/chain/tdnn_lstm${affix}_sp/best_wer_$enhan.result - - echo "score looped decoding results" - local/chime4_calc_wers_looped.sh exp/chain/tdnn_lstm${affix}_sp $enhan exp/chain/tree_a_sp/graph_tgpr_5k \ - > exp/chain/tdnn_lstm${affix}_sp/best_wer_looped_$enhan.result - head -n 15 exp/chain/tdnn_lstm${affix}_sp/best_wer_looped_$enhan.result -fi - -exit 0; diff --git a/egs/chime4/s5_1ch/local/chain/run_tdnn_recog.sh b/egs/chime4/s5_1ch/local/chain/run_tdnn_recog.sh deleted file mode 100755 index 38a9cc391e7..00000000000 --- a/egs/chime4/s5_1ch/local/chain/run_tdnn_recog.sh +++ /dev/null @@ -1,200 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -stage=0 -nj=30 -train=noisy -enhan=$1 -mdir=$2 -train_set=tr05_multi_${train} -test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" -gmm=tri3b_tr05_multi_${train} # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. - -# Options which are not passed through to run_ivector_common.sh -affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -common_egs_dir= -reporting_email= - -# training chunk-options -chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 - -#decode options -test_online_decoding=false # if true, it will run the last decoding stage. - -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat < \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# check whether run_init is executed -if [ ! 
-d data/lang ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check whether run_init is executed -if [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check ivector extractor -if [ ! -d $mdir/exp/nnet3${nnet3_affix}/extractor ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/nnet3${nnet3_affix}/extractor ]; then - echo "copy $mdir/exp/nnet3${nnet3_affix}/extractor" - mkdir -p exp/nnet3${nnet3_affix} - cp -r $mdir/exp/nnet3${nnet3_affix}/extractor exp/nnet3${nnet3_affix}/ -fi - -# check tdnn graph -if [ ! -d $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k" - mkdir -p exp/chain${nnet3_affix}/tree_a_sp - cp -r $mdir/exp/chain${nnet3_affix}/tree_a_sp/graph_tgpr_5k exp/chain${nnet3_affix}/tree_a_sp/ -fi - -# check dir -if [ ! -d $mdir/exp/chain${nnet3_affix}/tdnn${affix}_sp ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/chain${nnet3_affix}/tdnn${affix}_sp ]; then - echo "copy $mdir/exp/chain${nnet3_affix}/tdnn${affix}_sp" - cp -r $mdir/exp/chain${nnet3_affix}/tdnn${affix}_sp exp/chain${nnet3_affix}/ - rm -rf exp/chain${nnet3_affix}/tdnn${affix}_sp/decode_* - rm -rf exp/chain${nnet3_affix}/tdnn${affix}_sp/best_* -fi - -dir=exp/chain${nnet3_affix}/tdnn${affix}_sp - -# note: you don't necessarily have to change the treedir name -# each time you do a new experiment-- only if you change the -# configuration in a way that affects the tree. -tree_dir=$mdir/exp/chain${nnet3_affix}/tree_a_sp - -# make ivector for dev and eval -if [ $stage -le 2 ]; then - for datadir in ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # extracting hires features - for datadir in ${test_sets}; do - steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires - steps/compute_cmvn_stats.sh data/${datadir}_hires - utils/fix_data_dir.sh data/${datadir}_hires - done - - # extract iVectors for the test data, but in this case we don't need the speed - # perturbation (sp). - for data in ${test_sets}; do - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l exp/chain/tdnn${affix}_sp/best_wer_$enhan.result - head -n 15 exp/chain/tdnn${affix}_sp/best_wer_$enhan.result -fi - - -exit 0; diff --git a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh index aa7d07b636a..3f8b7c60090 100755 --- a/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime4/s5_1ch/local/chain/tuning/run_tdnn_1a.sh @@ -1,20 +1,20 @@ #!/bin/bash -# This was modified from wsj/local/chain/tunning/run_tdnn_1d.sh to be +# This was modified from wsj/local/chain/tunning/run_tdnn_1e.sh to be # used in Chime4. 
#This is the result using all 6 channels: -# exp/chain/tdnn1a_sp/best_wer_beamformit_5mics.result +# exp/chain/tdnn1a_sp/best_wer_blstm_gev.result # ------------------- -# best overall dt05 WER 6.04% (language model weight = 9) +# best overall dt05 WER 4.34% (language model weight = 7) # ------------------- -# dt05_simu WER: 6.25% (Average), 5.71% (BUS), 6.92% (CAFE), 5.37% (PEDESTRIAN), 7.02% (STREET) +# dt05_simu WER: 4.46% (Average), 4.12% (BUS), 5.29% (CAFE), 4.00% (PEDESTRIAN), 4.42% (STREET) # ------------------- -# dt05_real WER: 5.83% (Average), 7.48% (BUS), 5.28% (CAFE), 4.43% (PEDESTRIAN), 6.13% (STREET) +# dt05_real WER: 4.21% (Average), 4.94% (BUS), 4.07% (CAFE), 3.81% (PEDESTRIAN), 4.04% (STREET) # ------------------- -# et05_simu WER: 10.30% (Average), 7.34% (BUS), 10.37% (CAFE), 10.05% (PEDESTRIAN), 13.43% (STREET) +# et05_simu WER: 5.50% (Average), 4.78% (BUS), 5.86% (CAFE), 5.51% (PEDESTRIAN), 5.83% (STREET) # ------------------- -# et05_real WER: 9.67% (Average), 12.71% (BUS), 8.33% (CAFE), 8.20% (PEDESTRIAN), 9.45% (STREET) +# et05_real WER: 5.78% (Average), 6.82% (BUS), 5.10% (CAFE), 5.70% (PEDESTRIAN), 5.51% (STREET) # ------------------- # Final train prob -0.080 # Final valid prob -0.075 @@ -32,9 +32,7 @@ set -e -o pipefail stage=1 nj=30 train=noisy -enhan=$1 train_set=tr05_multi_${train} -test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" gmm=tri3b_tr05_multi_${train} # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. num_threads_ubm=32 @@ -57,11 +55,11 @@ chunk_right_context=0 # training options srand=0 -remove_egs=false +remove_egs=true #decode options test_online_decoding=false # if true, it will run the last decoding stage. - +decode_only=false # if true, it wouldn't train a model again and will only do decoding # End configuration section. echo "$0 $@" # Print the command line for logging @@ -70,6 +68,8 @@ echo "$0 $@" # Print the command line for logging . ./path.sh . ./utils/parse_options.sh +enhan=$1 +test_sets="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" if ! 
cuda-compiled; then cat < $dir/configs/network.xconfig @@ -187,18 +232,18 @@ if [ $stage -le 15 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-batchnorm-layer name=tdnn1 dim=750 - relu-batchnorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) - relu-batchnorm-layer name=tdnn3 dim=750 - relu-batchnorm-layer name=tdnn4 dim=750 input=Append(-1,0,1) - relu-batchnorm-layer name=tdnn5 dim=750 - relu-batchnorm-layer name=tdnn6 dim=750 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn7 dim=750 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn8 dim=750 input=Append(-6,-3,0) + relu-batchnorm-layer name=tdnn1 $opts dim=850 + relu-batchnorm-layer name=tdnn2 $opts dim=850 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=850 + relu-batchnorm-layer name=tdnn4 $opts dim=850 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=850 + relu-batchnorm-layer name=tdnn6 $opts dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=850 input=Append(-6,-3,0) ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=750 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + relu-batchnorm-layer name=prefinal-chain $opts dim=850 target-rms=0.5 + output-layer name=output $output_opts include-log-softmax=false dim=$num_targets max-change=1.5 # adding the layers for xent branch # This block prints the configs for a separate output that will be @@ -209,8 +254,8 @@ if [ $stage -le 15 ]; then # final-layer learns at a rate independent of the regularization # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. 
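As a rough sketch (outside the patch): the 0.5 referred to above normally enters through a learning_rate_factor variable defined earlier in the script, on a line not visible in this hunk; assuming the usual chain-recipe default of xent_regularize=0.1, the xent output layer gets a factor of 5.0:

  xent_regularize=0.1                                                   # assumed default
  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python3)
  echo "learning-rate-factor for output-xent: $learning_rate_factor"    # prints 5.0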
- relu-batchnorm-layer name=prefinal-xent input=tdnn8 dim=750 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + relu-batchnorm-layer name=prefinal-xent $opts input=tdnn8 dim=850 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi @@ -221,7 +266,12 @@ if [ $stage -le 16 ]; then utils/create_split_dir.pl \ /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime4-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - + + cat $train_data_dir/utt2uniq | awk -F' ' '{print $1}' > $train_data_dir/utt2uniq.tmp1 + cat $train_data_dir/utt2uniq | awk -F' ' '{print $2}' | sed -e 's/\....//g' | sed -e 's/\_CH.//g' | sed -e 's/\_enhan//g' > $train_data_dir/utt2uniq.tmp2 + paste -d" " $train_data_dir/utt2uniq.tmp1 $train_data_dir/utt2uniq.tmp2 > $train_data_dir/utt2uniq + rm -rf $train_data_dir/utt2uniq.tmp{1,2} + steps/nnet3/chain/train.py --stage=$train_stage \ --cmd="$decode_cmd" \ --feat.online-ivector-dir=$train_ivector_dir \ @@ -233,16 +283,17 @@ if [ $stage -le 16 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=6 \ + --trainer.num-epochs=12 \ --trainer.frames-per-iter=3000000 \ --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=5 \ - --trainer.optimization.initial-effective-lrate=0.003 \ - --trainer.optimization.final-effective-lrate=0.0003 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.005 \ + --trainer.optimization.final-effective-lrate=0.0005 \ --trainer.optimization.shrink-value=1.0 \ - --trainer.optimization.proportional-shrink=60.0 \ --trainer.num-chunk-per-minibatch=128,64 \ --trainer.optimization.momentum=0.0 \ + --trainer.optimization.backstitch-training-scale=0.3 \ + --trainer.optimization.backstitch-training-interval=1 \ --egs.chunk-width=$chunk_width \ --egs.chunk-left-context=0 \ --egs.chunk-right-context=0 \ @@ -280,8 +331,11 @@ if [ $stage -le 18 ]; then for data in $test_sets; do ( + utils/data/modify_speaker_info.sh --seconds-per-spk-max 200 \ + data/${data}_hires data/${data}_chunked + data_affix=$(echo $data | sed s/test_//) - nspk=$(wc -l $dir/configs/network.xconfig diff --git a/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh b/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh index 9fe4a20f43a..84bb2cb8dbd 100755 --- a/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh +++ b/egs/chime4/s5_1ch/local/chime4_calc_wers_looped.sh @@ -82,4 +82,4 @@ for e_d in $tasks; do | utils/int2sym.pl -f 2- $graph_dir/words.txt \ | sed s:\::g done -done \ No newline at end of file +done diff --git a/egs/chime4/s5_1ch/local/compute_pesq.sh b/egs/chime4/s5_1ch/local/compute_pesq.sh new file mode 100755 index 00000000000..1d290a4893f --- /dev/null +++ b/egs/chime4/s5_1ch/local/compute_pesq.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +# This script creates the average PESQ score of files in an enhanced directory with corresponding +# files in a reference directory. +# Expects the PESQ third party executable in "local/PESQ" +# PESQ source was dowloaded and compiled using "local/download_se_eval_tool.sh" +# Eg. 
local/compute_pesq.sh blstm_gev enhan/blstm_gev local/nn-gev/data/audio/16kHz/isolated_ext $PWD + +set -e +set -u +set -o pipefail + +if [ $# != 4 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: local/compute_pesq.sh " + exit 1; +fi + +enhancement_method=$1 +enhancement_directory=$2 +chime_rir_directory=$3 +modeldir=$4 + +expdir=$modeldir/exp/compute_pesq_${enhancement_method} +mkdir -p $expdir +pushd $expdir +ls $enhancement_directory/et05_*_simu/*.wav > $expdir/et05_files +ls $enhancement_directory/dt05_*_simu/*.wav > $expdir/dt05_files + +for set in "dt05" "et05" +do +declare -i n_files=0 +t_mos=0 +avg_mos=0 + while read filename; do + n_files=$n_files+1 + target_filename=`echo $filename | rev | cut -d"/" -f1 | rev` + speaker=`echo $target_filename | cut -d"_" -f1` + utt_id=`echo $target_filename | cut -d"_" -f2` + noise_cap=`echo $target_filename | cut -d"_" -f3 | cut -d"." -f1` + noise=`echo "$noise_cap" | awk '{ print tolower($1) }'` + temp=`$modeldir/local/PESQ +16000 ../../$chime_rir_directory/"$set"_"$noise"_simu/"$speaker"_"$utt_id"_"$noise_cap".CH5.Clean.wav $filename` + pesq_score=`echo $temp | rev | cut -d " " -f1 | rev` + t_mos=$(awk "BEGIN {print $t_mos+$pesq_score; exit}") + done <$expdir/"$set"_files +avg_mos=$(awk "BEGIN {print $t_mos/$n_files; exit}") +echo $avg_mos>"$expdir"/pesq_"$set" +done +popd diff --git a/egs/chime4/s5_1ch/local/compute_stoi_estoi_sdr.sh b/egs/chime4/s5_1ch/local/compute_stoi_estoi_sdr.sh new file mode 100755 index 00000000000..b7627560b67 --- /dev/null +++ b/egs/chime4/s5_1ch/local/compute_stoi_estoi_sdr.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +# This script creates the list of enhanced files and reference files and calls the +# matlab script "stoi_estoi_sdr.m" to get STOI, eSTOI and SDR scores +# Eg. local/compute_stoi_estoi_sdr.sh --njobs 10 blstm_gev enhan/blstm_gev local/nn-gev/data/audio/16kHz/isolated_ext + +. ./cmd.sh +. ./path.sh +set -e +set -u +set -o pipefail + +njobs=10 +cmd=run.pl + +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/compute_stoi_estoi_sdr.sh [options] " + echo "options" + echo " --njobs # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +enhancement_method=$1 +enhancement_directory=$2 +chime_rir_directory=$3 + +expdir=exp/compute_stoi_estoi_sdr_${enhancement_method} +mkdir -p $expdir +ls $chime_rir_directory/dt05_*/*CH5.Clean.wav > $expdir/original_list +ls $enhancement_directory/dt05_*simu/*.wav > $expdir/enhanced_list +$cmd $expdir/compute_stoi_estoi_sdr_dt05.log matlab -nodisplay -nosplash -r "addpath('local'); stoi_estoi_sdr($njobs,'$enhancement_method','$expdir','dt05');exit" +ls $chime_rir_directory/et05_*/*CH5.Clean.wav > $expdir/original_list +ls $enhancement_directory/et05_*simu/*.wav > $expdir/enhanced_list +$cmd $expdir/compute_stoi_estoi_sdr_et05.log matlab -nodisplay -nosplash -r "addpath('local'); stoi_estoi_sdr($njobs,'$enhancement_method','$expdir','et05');exit" diff --git a/egs/chime4/s5_1ch/local/download_se_eval_tool.sh b/egs/chime4/s5_1ch/local/download_se_eval_tool.sh new file mode 100755 index 00000000000..ddd86a03d8a --- /dev/null +++ b/egs/chime4/s5_1ch/local/download_se_eval_tool.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +wget http://bass-db.gforge.inria.fr/bss_eval/bss_eval_sources.m -O local/bss_eval_sources.m +wget https://github.com/JacobD10/SoundZone_Tools/raw/master/stoi.m -O local/stoi.m +wget https://github.com/JacobD10/SoundZone_Tools/raw/master/estoi.m -O local/estoi.m +wget 'https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-P.862-200102-I!!SOFT-ZST-E&type=items' -O PESQ.zip +unzip PESQ.zip -d local/PESQ_sources +cd local/PESQ_sources/P862/Software/source +gcc *.c -lm -o PESQ +cd ../../../../../ +mv local/PESQ_sources/P862/Software/source/PESQ local/ diff --git a/egs/chime4/s5_1ch/local/fix_read_sim_from_different_directory.patch b/egs/chime4/s5_1ch/local/fix_read_sim_from_different_directory.patch new file mode 100644 index 00000000000..46121357c5e --- /dev/null +++ b/egs/chime4/s5_1ch/local/fix_read_sim_from_different_directory.patch @@ -0,0 +1,244 @@ +diff --git a/beamform.py b/beamform.py +index 02eeed8..070c76d 100644 +--- a/beamform.py ++++ b/beamform.py +@@ -6,9 +6,10 @@ from chainer import Variable + from chainer import cuda + from chainer import serializers + from tqdm import tqdm ++import sys + +-from chime_data import gen_flist_simu, \ +- gen_flist_real, get_audio_data, get_audio_data_with_context ++from chime_data import gen_flist_simu, gen_flist_2ch,\ ++ gen_flist_real, get_audio_data, get_audio_data_1ch, get_audio_data_with_context + from fgnt.beamforming import gev_wrapper_on_masks + from fgnt.signal_processing import audiowrite, stft, istft + from fgnt.utils import Timer +@@ -20,6 +21,8 @@ parser.add_argument('flist', + help='Name of the flist to process (e.g. 
tr05_simu)') + parser.add_argument('chime_dir', + help='Base directory of the CHiME challenge.') ++parser.add_argument('sim_dir', ++ help='Base directory of the CHiME challenge simulated data.') + parser.add_argument('output_dir', + help='The directory where the enhanced wav files will ' + 'be stored.') +@@ -29,6 +32,10 @@ parser.add_argument('model_type', + help='Type of model (BLSTM or FW)') + parser.add_argument('--gpu', '-g', default=-1, type=int, + help='GPU ID (negative value indicates CPU)') ++parser.add_argument('--single', '-s', default=0, type=int, ++ help='0 for multi-channel and channel number (1-6) for single channel') ++parser.add_argument('--track', '-t', default=6, type=int, ++ help='1, 2 or 6 depending on the data used') + args = parser.parse_args() + + # Prepare model +@@ -48,11 +55,35 @@ xp = np if args.gpu < 0 else cuda.cupy + stage = args.flist[:2] + scenario = args.flist.split('_')[-1] + ++if stage == 'tr' and (args.track == 1 or args.track == 2): ++ print("No train data for 1ch track and 2ch track"); ++ sys.exit(0); ++ + # CHiME data handling + if scenario == 'simu': +- flist = gen_flist_simu(args.chime_dir, stage) ++ if args.track == 6: ++ flist = gen_flist_simu(args.chime_dir, args.sim_dir, stage) ++ elif args.track == 2: ++ flist = gen_flist_2ch(args.chime_dir, stage, scenario) ++ elif args.track == 1: ++ flist = list() ++ for env in ['caf', 'bus', 'str', 'ped']: ++ flist_temp = os.listdir(os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario))) ++ flist_ext = [i for i in flist_temp if i.endswith('.wav')] ++ flist_with_dir = [os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario), i) for i in flist_ext] ++ flist = flist + flist_with_dir + elif scenario == 'real': +- flist = gen_flist_real(args.chime_dir, stage) ++ if args.track == 6: ++ flist = gen_flist_real(args.chime_dir, stage) ++ elif args.track == 2: ++ flist = gen_flist_2ch(args.chime_dir, stage, scenario) ++ elif args.track == 1: ++ flist = list() ++ for env in ['caf', 'bus', 'str', 'ped']: ++ flist_temp = os.listdir(os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario))) ++ flist_ext = [i for i in flist_temp if i.endswith('.wav')] ++ flist_with_dir = [os.path.join(args.chime_dir, 'audio', '16kHz', 'isolated_1ch_track', '{}05_{}_{}'.format(stage, env, scenario), i) for i in flist_ext] ++ flist = flist + flist_with_dir + else: + raise ValueError('Unknown flist {}'.format(args.flist)) + +@@ -67,12 +98,19 @@ t_beamform = 0 + # Beamform loop + for cur_line in tqdm(flist): + with Timer() as t: +- if scenario == 'simu': ++ if args.track == 6: ++ if scenario == 'simu': ++ audio_data = get_audio_data(cur_line) ++ context_samples = 0 ++ elif scenario == 'real': ++ audio_data, context_samples = get_audio_data_with_context( ++ cur_line[0], cur_line[1], cur_line[2]) ++ elif args.track == 2: + audio_data = get_audio_data(cur_line) + context_samples = 0 +- elif scenario == 'real': +- audio_data, context_samples = get_audio_data_with_context( +- cur_line[0], cur_line[1], cur_line[2]) ++ elif args.track == 1: ++ audio_data = get_audio_data_1ch(cur_line) ++ context_samples = 0 + t_io += t.msecs + Y = stft(audio_data, time_dim=1).transpose((1, 0, 2)) + Y_var = Variable(np.abs(Y).astype(np.float32), True) +@@ -85,28 +123,45 @@ for cur_line in tqdm(flist): + t_net += t.msecs + + with Timer() as t: +- N_mask = np.median(N_masks.data, axis=1) +- X_mask = 
np.median(X_masks.data, axis=1) +- Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask) ++ if args.single >= 1 or args.track == 1: ++ Y_hat = X_masks.data * Y ++ elif args.single == 0: ++ N_mask = np.median(N_masks.data, axis=1) ++ X_mask = np.median(X_masks.data, axis=1) ++ Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask) + t_beamform += t.msecs + +- if scenario == 'simu': +- wsj_name = cur_line.split('/')[-1].split('_')[1] +- spk = cur_line.split('/')[-1].split('_')[0] +- env = cur_line.split('/')[-1].split('_')[-1] +- elif scenario == 'real': +- wsj_name = cur_line[3] +- spk = cur_line[0].split('/')[-1].split('_')[0] +- env = cur_line[0].split('/')[-1].split('_')[-1] ++ if args.track == 1: ++ env = cur_line.split('/')[-1].split('_')[2].split('.')[0] ++ filename = os.path.join(args.output_dir, '{}05_{}_{}'.format(stage, env.lower(), scenario), os.path.basename(cur_line)) ++ else: ++ if scenario == 'simu' or args.track == 2: ++ wsj_name = cur_line.split('/')[-1].split('_')[1] ++ spk = cur_line.split('/')[-1].split('_')[0] ++ env = cur_line.split('/')[-1].split('_')[-1] ++ elif scenario == 'real': ++ wsj_name = cur_line[3] ++ spk = cur_line[0].split('/')[-1].split('_')[0] ++ env = cur_line[0].split('/')[-1].split('_')[-1] + +- filename = os.path.join( +- args.output_dir, +- '{}05_{}_{}'.format(stage, env.lower(), scenario), +- '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()) +- ) +- with Timer() as t: +- audiowrite(istft(Y_hat)[context_samples:], filename, 16000, True, True) +- t_io += t.msecs ++ filename = os.path.join( ++ args.output_dir, ++ '{}05_{}_{}'.format(stage, env.lower(), scenario), ++ '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()) ++ ) ++ if args.track == 1: ++ with Timer() as t: ++ audiowrite(istft(Y_hat[:,0,:])[int(context_samples):], filename, 16000, True, True) ++ t_io += t.msecs ++ elif args.single == 0: ++ with Timer() as t: ++ audiowrite(istft(Y_hat)[int(context_samples):], filename, 16000, True, True) ++ t_io += t.msecs ++ elif args.single >= 1: ++ ch = args.single ++ with Timer() as t: ++ audiowrite(istft(Y_hat[:,ch-1,:])[int(context_samples):], filename, 16000, True, True) ++ t_io += t.msecs + + print('Finished') + print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format( +diff --git a/beamform.sh b/beamform.sh +index 3c7de5a..aaae10d 100755 +--- a/beamform.sh ++++ b/beamform.sh +@@ -1,5 +1,5 @@ + #!/usr/bin/env bash + + for flist in tr05_simu tr05_real dt05_simu dt05_real et05_simu et05_real; do +- python beamform.py $flist "$@" +-done +\ No newline at end of file ++ $HOME/miniconda3/bin/python local/nn-gev/beamform.py $flist "$@" ++done +diff --git a/chime_data.py b/chime_data.py +index 0072e1b..641d9d3 100644 +--- a/chime_data.py ++++ b/chime_data.py +@@ -11,7 +11,7 @@ from fgnt.signal_processing import stft + from fgnt.utils import mkdir_p + + +-def gen_flist_simu(chime_data_dir, stage, ext=False): ++def gen_flist_simu(chime_data_dir, dest_dir, stage, ext=False): + with open(os.path.join( + chime_data_dir, 'annotations', + '{}05_{}.json'.format(stage, 'simu'))) as fid: +@@ -21,7 +21,7 @@ def gen_flist_simu(chime_data_dir, stage, ext=False): + else: + isolated_dir = 'isolated' + flist = [os.path.join( +- chime_data_dir, 'audio', '16kHz', isolated_dir, ++ dest_dir, 'audio', '16kHz', isolated_dir, + '{}05_{}_{}'.format(stage, a['environment'].lower(), 'simu'), + '{}_{}_{}'.format(a['speaker'], a['wsj_name'], a['environment'])) + for a in annotations] +@@ -39,11 +39,33 @@ def gen_flist_real(chime_data_dir, stage): + return flist_tuples + + ++def 
gen_flist_2ch(chime_data_dir, stage, scenario): ++ with open(os.path.join( ++ chime_data_dir, 'annotations', ++ '{}05_{}.json'.format(stage, scenario))) as fid: ++ annotations = json.load(fid) ++ flist = [os.path.join( ++ chime_data_dir, 'audio', '16kHz', 'isolated_2ch_track', ++ '{}05_{}_{}'.format(stage, a['environment'].lower(), scenario), ++ '{}_{}_{}'.format(a['speaker'], a['wsj_name'], a['environment'])) ++ for a in annotations] ++ return flist ++ ++ ++def get_audio_data_1ch(filename): ++ audio_data = list() ++ audio_data.append(audioread(filename)[None, :]) ++ audio_data = np.concatenate(audio_data, axis=0) ++ audio_data = audio_data.astype(np.float32) ++ return audio_data ++ ++ + def get_audio_data(file_template, postfix='', ch_range=range(1, 7)): + audio_data = list() + for ch in ch_range: +- audio_data.append(audioread( +- file_template + '.CH{}{}.wav'.format(ch, postfix))[None, :]) ++ if os.path.isfile(file_template + '.CH{}{}.wav'.format(ch, postfix)): ++ audio_data.append(audioread( ++ file_template + '.CH{}{}.wav'.format(ch, postfix))[None, :]) + audio_data = np.concatenate(audio_data, axis=0) + audio_data = audio_data.astype(np.float32) + return audio_data +@@ -65,7 +87,7 @@ def get_audio_data_with_context(embedded_template, t_start, t_end, + + def prepare_training_data(chime_data_dir, dest_dir): + for stage in ['tr', 'dt']: +- flist = gen_flist_simu(chime_data_dir, stage, ext=True) ++ flist = gen_flist_simu(chime_data_dir, dest_dir, stage, ext=True) + export_flist = list() + mkdir_p(os.path.join(dest_dir, stage)) + for f in tqdm.tqdm(flist, desc='Generating data for {}'.format(stage)): diff --git a/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh index edbbfd41e69..0173b022176 100755 --- a/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh +++ b/egs/chime4/s5_1ch/local/real_noisy_chime4_data_prep.sh @@ -68,10 +68,14 @@ if $eval_flag; then cp $trans_dir/et05_real.dot_all et05_real.dot fi -# make a scp file from file list +# make a scp temporary file from file list for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_REAL/' > ${x}_wav.id.temp + cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch + cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1 + cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2 + paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp done #make a transcription from dot @@ -98,13 +102,17 @@ fi # data-preparation stage independent of the specific lexicon used. noiseword=""; for x in $list_set;do + cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1 + cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2 + paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1 cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ | sort > $x.txt || exit 1; done # Make the utt2spk and spk2utt files. 
for x in $list_set; do - cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + sort ${x}_wav.scp.temp > ${x}_wav.scp + cat ${x}_wav.scp | awk -F'_' '{print $1"_"$2}' > $x.spk cat ${x}_wav.scp | awk '{print $1}' > $x.utt paste -d" " $x.utt $x.spk > $x.utt2spk cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; @@ -119,4 +127,8 @@ for x in $list_set; do cp ${x}.utt2spk ../../$x/utt2spk || exit 1; done +# clean up temp files +rm *.temp +rm *.part{1,2} + echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/rnnlm/run_lstm.sh b/egs/chime4/s5_1ch/local/rnnlm/run_lstm.sh new file mode 120000 index 00000000000..c53740399ce --- /dev/null +++ b/egs/chime4/s5_1ch/local/rnnlm/run_lstm.sh @@ -0,0 +1 @@ +tuning/run_lstm_1a.sh \ No newline at end of file diff --git a/egs/chime4/s5_1ch/local/rnnlm/run_lstm_back.sh b/egs/chime4/s5_1ch/local/rnnlm/run_lstm_back.sh new file mode 100755 index 00000000000..76e2b563e6b --- /dev/null +++ b/egs/chime4/s5_1ch/local/rnnlm/run_lstm_back.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Szu-Jui Chen + +# This script trains LMs on the reversed Chime4 data, which we +# call it backward model. + +# Begin configuration section. +affix=1a +dir=exp/rnnlm_lstm_${affix}_back +embedding_dim=2048 +lstm_rpd=512 +lstm_nrpd=512 +stage=-10 +train_stage=-10 + +# variables for lattice rescoring +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially + +. cmd.sh +. utils/parse_options.sh + +srcdir=data/local/local_lm +lexicon=data/local/dict/lexiconp.txt +text_dir=data/rnnlm/text_nosp_${affix}_back +mkdir -p $dir/config +set -e + +for f in $lexicon; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1 +done + +#prepare training and dev data +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $srcdir/train.rnn | awk '{for(i=NF;i>0;i--) printf("%s ",$i); print""}'> $text_dir/chime4.txt.tmp + sed -e "s///g" $text_dir/chime4.txt.tmp > $text_dir/chime4.txt + rm $text_dir/chime4.txt.tmp + cat $srcdir/valid.rnn | awk '{for(i=NF;i>0;i--) printf("%s ",$i); print""}'> $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp data/lang_chain/words.txt $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig <//g" $text_dir/chime4.txt.tmp > $text_dir/chime4.txt + cp $srcdir/valid.rnn $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp data/lang_chain/words.txt $dir/config/words.txt + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < $tgtdir/best_wer_${enhan}_${decode_dir_suffix}.result + head -n 15 $tgtdir/best_wer_${enhan}_${decode_dir_suffix}.result +fi + +nbest=100 +rnnweight=0.8 +if [ $stage -le 6 ] && $run_nbest_rescore; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + for decode_set in dt05_real dt05_simu et05_real et05_simu; do + decode_dir=$tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${LM} + ( + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --cmd "$train_cmd --mem 2G" --N $nbest \ + $rnnweight data/lang_test_$LM $dir \ + data/${decode_set}_${enhan}_chunked ${decode_dir} \ + $tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest} + + if $use_backward_model; then + rnnlm/lmrescore_nbest_back.sh \ + --cmd "$train_cmd --mem 2G" --N $nbest \ + $rnnweight data/lang_test_$LM ${dir}_back \ + data/${decode_set}_${enhan}_chunked \ + $tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest} \ + $tgtdir/decode_tgpr_5k_${decode_set}_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi + fi + ) & + done + wait + # calc wers for nbest-rescoring results + if $use_backward_model; then + local/chime4_calc_wers.sh $tgtdir ${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi \ + $tgtdir/graph_tgpr_5k \ + > $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi.result + head -n 15 $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}_bi.result + else + local/chime4_calc_wers.sh $tgtdir ${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest} \ + $tgtdir/graph_tgpr_5k \ + > $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}.result + head -n 15 $tgtdir/best_wer_${enhan}_${decode_dir_suffix}_w${rnnweight}_n${nbest}.result + fi +fi + +exit 0 diff --git a/egs/chime4/s5_1ch/local/run_blstm_gev.sh b/egs/chime4/s5_1ch/local/run_blstm_gev.sh new file mode 100755 index 00000000000..2ee92b70fbd --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_blstm_gev.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=10 +cmd=run.pl +track=6 +. utils/parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: local/run_blstm_gev.sh [options] " + echo "main options (for others, see top of script file)" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --track # Chime data to use (1, 2 or 6)" + exit 1; +fi + +sdir=$1 +chime3_dir=$2 +odir=$3 +enhancement_type=$4 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. 
Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_chainer.sh';" +fi + +# check if chainer is installed +result=`$HOME/miniconda3/bin/python -c "\ +try: + import chainer + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "Chainer is installed" +else + echo "Chainer is not installed. Please run ../../../tools/extras/install_chainer.sh" +fi + +if [ ! -d local/nn-gev ]; then + cd local/ + git clone https://github.com/fgnt/nn-gev.git + cd nn-gev/ + git checkout 3a039a4b707419fab05deb9679b41360ea92d779 . + git apply ../fix_read_sim_from_different_directory.patch + cd ../../ +else + cd local/nn-gev/ + git checkout 3a039a4b707419fab05deb9679b41360ea92d779 . + git apply ../fix_read_sim_from_different_directory.patch + cd ../../ +fi + +mkdir -p $odir +set +e +n_isolated_dirs=`ls local/nn-gev/data/audio/16kHz/isolated/ 2>/dev/null | wc -l` +n_isolated_ext_dirs=`ls local/nn-gev/data/audio/16kHz/isolated_ext/ 2>/dev/null | wc -l` +set -e +if [[ "$n_isolated_dirs" -ne 12 || "$n_isolated_ext_dirs" -ne 12 ]];then + echo "generating simulation data and storing in local/nn-gev/data" + $cmd $odir/simulation.log matlab -nodisplay -nosplash -r "addpath('local'); CHiME3_simulate_data_patched_parallel(1,$nj,'$sdir','$chime3_dir');exit" +else + echo "Didn't run Matlab simulation. Using existing data in local/nn-gev/data/audio/" +fi + +echo "Training a BLSTM-based mask network and enhancing signals with mask-based GEV beamformer" +$cuda_cmd $odir/beamform.log local/run_nn-gev.sh $sdir $odir $enhancement_type $track diff --git a/egs/chime4/s5_1ch/local/run_dnn.sh b/egs/chime4/s5_1ch/local/run_dnn.sh deleted file mode 100755 index 2207574e71c..00000000000 --- a/egs/chime4/s5_1ch/local/run_dnn.sh +++ /dev/null @@ -1,237 +0,0 @@ -#!/bin/bash - -# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 -# made by Chao Weng - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# Config: -nj=30 -stage=0 # resume training with --stage N -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 1 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - exit 1; -fi - -# set enhanced data -enhan=$1 - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail - -# check whether run_init is executed -if [ ! -d data/lang ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# check whether run_init is executed -if [ ! 
-d exp/tri3b_tr05_multi_${train} ]; then - echo "error, execute local/run_init.sh, first" - exit 1; -fi - -# get alignments -if [ $stage -le 0 ]; then - steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ - data/tr05_multi_${train} data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali - steps/align_fmllr.sh --nj 4 --cmd "$train_cmd" \ - data/dt05_multi_$enhan data/lang exp/tri3b_tr05_multi_${train} exp/tri3b_tr05_multi_${train}_ali_dt05 -fi - -# make fmllr feature for training multi = simu + real -gmmdir=exp/tri3b_tr05_multi_${train}_ali -data_fmllr=data-fmllr-tri3b -mkdir -p $data_fmllr -fmllrdir=fmllr-tri3b/${train} -if [ $stage -le 1 ]; then - for x in tr05_real_${train} tr05_simu_${train}; do - steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ - --transform-dir $gmmdir \ - $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir - done -fi - -# make fmllr feature for dev and eval -gmmdir=exp/tri3b_tr05_multi_${train} -fmllrdir=fmllr-tri3b/$enhan -if [ $stage -le 2 ]; then - if $eval_flag; then - tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" - else - tasks="dt05_real_$enhan dt05_simu_$enhan" - fi - for x in $tasks; do - steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ - --transform-dir $gmmdir/decode_tgpr_5k_$x \ - $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir - done -fi - -# make mixed training set from real and simulation enhanced data -# multi = simu + real -if [ $stage -le 3 ]; then - for data_dir in $data_fmllr/tr05_real_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/dt05_real_$enhan $data_fmllr/dt05_simu_$enhan; do - utils/data/get_utt2dur.sh $data_dir - done - - utils/combine_data.sh $data_fmllr/tr05_multi_${train} $data_fmllr/tr05_simu_${train} $data_fmllr/tr05_real_${train} - utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan - if $eval_flag; then - for data_dir in $data_fmllr/et05_real_$enhan $data_fmllr/et05_simu_$enhan; do - utils/data/get_utt2dur.sh $data_dir - done - utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan - fi -fi - -# pre-train dnn -dir=exp/tri4a_dnn_pretrain_tr05_multi_${train} -if [ $stage -le 4 ]; then - $cuda_cmd $dir/_pretrain_dbn.log \ - steps/nnet/pretrain_dbn.sh --nn-depth 7 --rbm-iter 3 $data_fmllr/tr05_multi_${train} $dir -fi - -# train dnn -dir=exp/tri4a_dnn_tr05_multi_${train} -ali=exp/tri3b_tr05_multi_${train}_ali -ali_dev=exp/tri3b_tr05_multi_${train}_ali_dt05 -feature_transform=exp/tri4a_dnn_pretrain_tr05_multi_${train}/final.feature_transform -dbn=exp/tri4a_dnn_pretrain_tr05_multi_${train}/7.dbn -if [ $stage -le 5 ]; then - $cuda_cmd $dir/_train_nnet.log \ - steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ - $data_fmllr/tr05_multi_${train} $data_fmllr/dt05_multi_$enhan data/lang $ali $ali_dev $dir -fi - -# decode enhanced speech -if [ $stage -le 6 ]; then - utils/mkgraph.sh data/lang_test_tgpr_5k $dir $dir/graph_tgpr_5k - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/dt05_real_$enhan $dir/decode_tgpr_5k_dt05_real_$enhan & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/dt05_simu_$enhan $dir/decode_tgpr_5k_dt05_simu_$enhan & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 
--num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/et05_real_$enhan $dir/decode_tgpr_5k_et05_real_$enhan & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --acwt 0.10 --config conf/decode_dnn.config \ - $dir/graph_tgpr_5k $data_fmllr/et05_simu_$enhan $dir/decode_tgpr_5k_et05_simu_$enhan & - fi - wait; -fi - -# Sequence training using sMBR criterion, we do Stochastic-GD -# with per-utterance updates. We use usually good acwt 0.1 -# Lattices are re-generated after 1st epoch, to get faster convergence. -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr -srcdir=exp/tri4a_dnn_tr05_multi_${train} -acwt=0.1 - -# First we generate lattices and alignments: -# awk -v FS="/" '{ NF_nosuffix=$NF; sub(".gz","",NF_nosuffix); print NF_nosuffix gunzip -c "$0" |"; }' in -# steps/nnet/make_denlats.sh -if [ $stage -le 7 ]; then - steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali - steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats -fi - -# Re-train the DNN by 1 iteration of sMBR -if [ $stage -le 8 ]; then - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir -fi - -# Decode (reuse HCLG graph) -if [ $stage -le 9 ]; then - for ITER in 1; do - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & - fi - done -fi - -# Re-generate lattices, run 4 more sMBR iterations -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats -srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr -acwt=0.1 - -# Generate lattices and alignments: -if [ $stage -le 10 ]; then - steps/nnet/align.sh --nj $nj --cmd "$train_cmd" \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali - steps/nnet/make_denlats.sh --nj $nj --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_denlats -fi - -# Re-train the DNN by 4 iterations of sMBR -if [ $stage -le 11 ]; then - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \ - $data_fmllr/tr05_multi_${train} data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 -fi - -# Decode (reuse HCLG graph) -if [ $stage -le 12 ]; then - for ITER in 1 2 3 4; do - 
steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & - fi - done - wait -fi - -# scoring -if [ $stage -le 13 ]; then - # decoded results of enhanced speech using DNN AMs trained with enhanced data - local/chime4_calc_wers.sh exp/tri4a_dnn_tr05_multi_${train} $enhan exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ - > exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result - head -n 15 exp/tri4a_dnn_tr05_multi_${train}/best_wer_$enhan.result - # decoded results of enhanced speech using sequence-training DNN - ./local/chime4_calc_wers_smbr.sh exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k \ - > exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result - head -n 15 exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats/best_wer_${enhan}.result -fi - -echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_dnn_recog.sh b/egs/chime4/s5_1ch/local/run_dnn_recog.sh deleted file mode 100755 index 5e6ade02387..00000000000 --- a/egs/chime4/s5_1ch/local/run_dnn_recog.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/bash - -# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 -# made by Chao Weng - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# Config: -nj=30 -stage=0 # resume training with --stage=N -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set enhanced data -enhan=$1 -# set model directory -mdir=$2 - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail - -# check data/loca/data -if [ ! -d $mdir/data/local/data ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! 
-d data/local/data ]; then - echo "copy $mdir/data/local/data" - mkdir -p data/local - cp -r $mdir/data/local/data data/local/ -fi - -# check gmm model -if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "copy $mdir/exp/tri3b_tr05_multi_${train}" - mkdir -p exp - cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ -fi - -# check dnn graph -if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then - echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k" - mkdir -p exp/tri4a_dnn_tr05_multi_${train} - cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k exp/tri4a_dnn_tr05_multi_${train}/ -fi - -# check dnn smbr model -if [ ! -d $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats ]; then - echo "copy $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats" - mkdir -p exp - cp -r $mdir/exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats exp/ -fi - -# make fmllr feature for dev and eval -gmmdir=exp/tri3b_tr05_multi_${train} -data_fmllr=data-fmllr-tri3b -mkdir -p $data_fmllr -fmllrdir=fmllr-tri3b/$enhan -if [ $stage -le 4 ]; then - if $eval_flag; then - tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" - else - tasks="dt05_real_$enhan dt05_simu_$enhan" - fi - for x in $tasks; do - steps/nnet/make_fmllr_feats.sh --nj 4 --cmd "$train_cmd" \ - --transform-dir $gmmdir/decode_tgpr_5k_$x \ - $data_fmllr/$x data/$x $gmmdir exp/make_fmllr_tri3b/$x $fmllrdir - done -fi - -# make mixed training set from real and simulation enhanced data -# multi = simu + real -if [ $stage -le 5 ]; then - utils/combine_data.sh $data_fmllr/dt05_multi_$enhan $data_fmllr/dt05_simu_$enhan $data_fmllr/dt05_real_$enhan - if $eval_flag; then - utils/combine_data.sh $data_fmllr/et05_multi_$enhan $data_fmllr/et05_simu_$enhan $data_fmllr/et05_real_$enhan - fi -fi - -# Re-generate lattices, run 4 more sMBR iterations -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats -acwt=0.1 - -# Decode (reuse HCLG graph) -if [ $stage -le 6 ]; then - for ITER in 1 2 3 4; do - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_real_${enhan} $dir/decode_tgpr_5k_dt05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/dt05_simu_${enhan} $dir/decode_tgpr_5k_dt05_simu_${enhan}_it${ITER} & - if $eval_flag; then - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_real_${enhan} $dir/decode_tgpr_5k_et05_real_${enhan}_it${ITER} & - steps/nnet/decode.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" --config conf/decode_dnn.config \ - --nnet $dir/${ITER}.nnet --acwt $acwt \ - exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k $data_fmllr/et05_simu_${enhan} $dir/decode_tgpr_5k_et05_simu_${enhan}_it${ITER} & - fi - wait - done -fi - -# scoring -if [ $stage -le 7 ]; then - # decoded results of enhanced speech using 
sequence-training DNN - ./local/chime4_calc_wers_smbr.sh $dir ${enhan} exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k > $dir/best_wer_${enhan}.result - head -n 15 $dir/best_wer_${enhan}.result -fi - -echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_gmm.sh b/egs/chime4/s5_1ch/local/run_gmm.sh index 2a3c8680f23..5178433dfc2 100755 --- a/egs/chime4/s5_1ch/local/run_gmm.sh +++ b/egs/chime4/s5_1ch/local/run_gmm.sh @@ -17,6 +17,8 @@ nj=30 stage=0 # resume training with --stage=N train=noisy # noisy data multi-condition training eval_flag=true # make it true when the evaluation data are released +add_enhanced_data=true # make it true when you want to add enhanced data into training set +decode_only=false # if true, it wouldn't train a model again and will only do decoding . utils/parse_options.sh || exit 1; @@ -49,6 +51,33 @@ if [ ! -d data/lang ]; then exit 1; fi +if $decode_only; then + # check data/loca/data + mdir=`pwd` + if [ ! -d $mdir/data/local/data ]; then + echo "error, set $mdir correctly" + exit 1; + elif [ ! -d data/local/data ]; then + echo "copy $mdir/data/local/data" + mkdir -p data/local + cp -r $mdir/data/local/data data/local/ + fi + # check gmm model + if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then + echo "error, set $mdir correctly" + exit 1; + elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then + echo "copy $mdir/exp/tri3b_tr05_multi_${train}" + mkdir -p exp + cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ + fi + # process for enhanced data + if [ ! -d data/dt05_real_$enhan ] || [ ! -d data/et05_real_$enhan ]; then + local/real_enhan_chime4_data_prep.sh $enhan $enhan_data + local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data + fi + stage=6 +fi ####################### #### training ######### if [ $stage -le 1 ]; then @@ -63,27 +92,51 @@ if [ $stage -le 1 ]; then local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data fi fi +# Copy enhanced data for 1ch and 2ch experiments +if [ $stage -le 2 ] && [[ "$PWD" != *s5_6ch* ]]; then + beamformed=0 + # First remove empty files generated from previous stage + for d in tr05_{real,simu}_$enhan; do + [ -d data/$d ] && rm -rf data/$d && \ + echo "remove empty directory $d" + done + if [[ "$enhan" == *beamformit_2mics* ]] && [ -d ../s5_6ch/data/tr05_real_beamformit_5mics ]; then + echo "copy tr05_{real,simu}_beamformit_5mics from ../s5_6ch/data/" + cp -r ../s5_6ch/data/tr05_real_beamformit_5mics data/tr05_real_beamformit_2mics + cp -r ../s5_6ch/data/tr05_simu_beamformit_5mics data/tr05_simu_beamformit_2mics + beamformed=1 + elif [ -d ../s5_6ch/data/tr05_real_$enhan ]; then + echo "copy enhanced training data ${d} from ../s5_6ch/data/" + cp -r ../s5_6ch/data/tr05_real_$enhan data/ + cp -r ../s5_6ch/data/tr05_simu_$enhan data/ + beamformed=1 + elif [[ "$enhan" == *isolated_1ch_track* ]]; then + beamformed=1 + fi + if [ $beamformed == 0 ]; then + echo "no such directory tr05_{real,simu}_{beamformit_5mics,blstm_gev,single_BLSTMmask}" + echo "They are generated by run_beamform_6ch_track.sh in ../s5_6ch/run.sh, please execute it first" && \ + exit 1; + fi +fi # Now make MFCC features for clean, close, and noisy data # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
mfccdir=mfcc -if [ $stage -le 2 ]; then - if $eval_flag; then - tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" +if [ $stage -le 3 ]; then + if $add_enhanced_data; then + if $eval_flag; then + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" + else + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" + fi else - tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} tr05_real_$enhan tr05_simu_$enhan" - fi - if [ "$enhan" == "beamformit_2mics" ]; then - for d in ../s5_6ch/data/tr05_{real,simu}_beamformit_5mics; do - [ ! -d $d ] && echo "no such directory $d" && \ - echo "It is generated by run_beamform_6ch_track.sh within ../s5_6ch/run.sh, execute it first" && \ - exit 1; - done - echo "copy enhanced training data from ../s5_6ch/data/" - rm -rf data/tr05_{real,simu}_beamformit_2mics - cp -r ../s5_6ch/data/tr05_real_beamformit_5mics data/tr05_real_beamformit_2mics - cp -r ../s5_6ch/data/tr05_simu_beamformit_5mics data/tr05_simu_beamformit_2mics + if $eval_flag; then + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train} et05_real_${train} et05_simu_${train}" + else + tasks="tr05_real_${train} dt05_real_${train} tr05_simu_${train} dt05_simu_${train}" + fi fi for x in $tasks; do steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ @@ -95,17 +148,20 @@ fi # make mixed training set from real and simulation training data # multi = simu + real # Note that we are combining enhanced training data with noisy training data -if [ $stage -le 3 ]; then - utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} data/tr05_simu_$enhan data/tr05_real_$enhan - #utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} +if [ $stage -le 4 ]; then + if $add_enhanced_data; then + utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} data/tr05_simu_$enhan data/tr05_real_$enhan + else + utils/combine_data.sh data/tr05_multi_${train} data/tr05_simu_${train} data/tr05_real_${train} + fi utils/combine_data.sh data/dt05_multi_${train} data/dt05_simu_${train} data/dt05_real_${train} if $eval_flag; then - utils/combine_data.sh data/et05_multi_${train} data/et05_simu_${train} data/et05_real_${train} + utils/combine_data.sh data/et05_multi_${train} data/et05_simu_${train} data/et05_real_${train} fi fi # training models for noisy data -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then nspk=`wc -l data/tr05_multi_${train}/spk2utt | awk '{print $1}'` if [ $nj -gt $nspk ]; then nj2=$nspk diff --git a/egs/chime4/s5_1ch/local/run_gmm_recog.sh b/egs/chime4/s5_1ch/local/run_gmm_recog.sh deleted file mode 100755 index 5f7f47b39d7..00000000000 --- a/egs/chime4/s5_1ch/local/run_gmm_recog.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# This script is made from the kaldi recipe of the 2nd CHiME Challenge Track 2 -# made by Chao Weng - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. 
- -# Config: -nj=30 -stage=0 # resume training with --stage=N -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 3 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies the directory of enhanced wav files" - echo "Third argument specifies acoustic and language model directory" - exit 1; -fi - -# set enhanced data -enhan=$1 -enhan_data=$2 -# set model directory -mdir=$3 - -# Set bash to 'debug' mode, it will exit on : -# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', -set -e -set -u -set -o pipefail - -# check data/loca/data -if [ ! -d $mdir/data/local/data ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d data/local/data ]; then - echo "copy $mdir/data/local/data" - mkdir -p data/local - cp -r $mdir/data/local/data data/local/ -fi - -# check gmm model -if [ ! -d $mdir/exp/tri3b_tr05_multi_${train} ]; then - echo "error, set $mdir correctly" - exit 1; -elif [ ! -d exp/tri3b_tr05_multi_${train} ]; then - echo "copy $mdir/exp/tri3b_tr05_multi_${train}" - mkdir -p exp - cp -r $mdir/exp/tri3b_tr05_multi_${train} exp/ -fi - -# process for enhanced data -if [ $stage -le 0 ]; then - if [ ! -d data/dt05_real_$enhan ] || [ ! -d data/et05_real_$enhan ]; then - local/real_enhan_chime4_data_prep.sh $enhan $enhan_data - local/simu_enhan_chime4_data_prep.sh $enhan $enhan_data - fi -fi - -# Now make MFCC features for enhanced data -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. -mfccdir=mfcc/$enhan -if [ $stage -le 1 ]; then - if $eval_flag; then - tasks="dt05_real_$enhan dt05_simu_$enhan et05_real_$enhan et05_simu_$enhan" - else - tasks="dt05_real_$enhan dt05_simu_$enhan" - fi - for x in $tasks; do - if [ ! -e data/$x/feats.scp ]; then - steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" \ - data/$x exp/make_mfcc/$x $mfccdir - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir - fi - done -fi - -# make mixed training set from real and simulation enhanced data -# multi = simu + real -if [ $stage -le 2 ]; then - if [ ! -d data/dt05_multi_$enhan ] || [ ! 
-d data/et05_multi_$enhan ]; then - utils/combine_data.sh data/dt05_multi_$enhan data/dt05_simu_$enhan data/dt05_real_$enhan - if $eval_flag; then - utils/combine_data.sh data/et05_multi_$enhan data/et05_simu_$enhan data/et05_real_$enhan - fi - fi -fi - -# decode enhanced speech using AMs trained with enhanced data -if [ $stage -le 3 ]; then - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_real_$enhan & - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/dt05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_dt05_simu_$enhan & - if $eval_flag; then - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_real_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_real_$enhan & - steps/decode_fmllr.sh --nj 4 --num-threads 3 --cmd "$decode_cmd" \ - exp/tri3b_tr05_multi_${train}/graph_tgpr_5k data/et05_simu_$enhan exp/tri3b_tr05_multi_${train}/decode_tgpr_5k_et05_simu_$enhan & - fi - wait; -fi - -# scoring -if [ $stage -le 4 ]; then - # decoded results of enhanced speech using AMs trained with enhanced data - local/chime4_calc_wers.sh exp/tri3b_tr05_multi_${train} $enhan exp/tri3b_tr05_multi_${train}/graph_tgpr_5k \ - > exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result - head -n 15 exp/tri3b_tr05_multi_${train}/best_wer_$enhan.result -fi - -echo "`basename $0` Done." diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh deleted file mode 100755 index 8b57585fda0..00000000000 --- a/egs/chime4/s5_1ch/local/run_lmrescore_recog.sh +++ /dev/null @@ -1,121 +0,0 @@ -#!/bin/bash - -# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) - -nj=12 -stage=1 -order=5 -hidden=300 -rnnweight=0.5 -nbest=100 -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set language models -lm_suffix=${order}gkn_5k -rnnlm_suffix=rnnlm_5k_h${hidden} - -# enhan data -enhan=$1 -# set model directory -mdir=$2 -srcdir=exp/tri4a_dnn_tr05_multi_${train}_smbr_i1lats - -# check language models -if [ ! -d $mdir/data/lang ]; then - echo "error, set $mdir correctly" - exit 1; -fi - -# preparation -dir=exp/tri4a_dnn_tr05_multi_${train}_smbr_lmrescore -mkdir -p $dir -# make a symbolic link to graph info -if [ ! -e $dir/graph_tgpr_5k ]; then - if [ ! -e exp/tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k ]; then - echo "graph is missing, execute local/run_dnn.sh, correctly" - exit 1; - fi - pushd . ; cd $dir - ln -s ../tri4a_dnn_tr05_multi_${train}/graph_tgpr_5k . 
- popd -fi - -# rescore lattices by a high-order N-gram -if [ $stage -le 3 ]; then - # check the best iteration - if [ ! -f $srcdir/log/best_wer_$enhan ]; then - echo "$0: error $srcdir/log/best_wer_$enhan not found. execute local/run_dnn.sh, first" - exit 1; - fi - it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` - # rescore lattices - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - $mdir/data/lang_test_tgpr_5k \ - $mdir/data/lang_test_${lm_suffix} \ - data-fmllr-tri3b/${t}_$enhan \ - $srcdir/decode_tgpr_5k_${t}_${enhan}_it$it \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - mkdir -p $dir/log - local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result -fi - -# N-best rescoring using a RNNLM -if [ $stage -le 4 ]; then - # check the best lmw - if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then - echo "error, rescoring with a high-order n-gram seems to be failed" - exit 1; - fi - lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` - # rescore n-best list for all sets - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - $mdir/data/lang_test_${lm_suffix} \ - $mdir/data/lang_test_${rnnlm_suffix} \ - data-fmllr-tri3b/${t}_$enhan \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh index 67572f0dd4c..58af793615e 100755 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh +++ b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn.sh @@ -98,7 +98,7 @@ if [ $stage -le 3 ]; then steps/lmrescore.sh --mode 3 \ data/lang_test_tgpr_5k \ data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ + data/${t}_${enhan}_chunked \ $srcdir/decode_tgpr_5k_${t}_${enhan} \ $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} done @@ -128,7 +128,7 @@ if [ $stage -le 4 ]; then $rnnweight \ data/lang_test_${lm_suffix} \ data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ + data/${t}_${enhan}_chunked \ $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} done diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh index 7173dcea78b..0bea4dd7102 100755 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh +++ b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm.sh @@ -165,4 +165,4 @@ if [ $stage -le 4 ]; then local/chime4_calc_wers_looped.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ > $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result head -n 15 $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi \ No newline at 
end of file +fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm_recog.sh deleted file mode 100755 index c4b4e238011..00000000000 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_lstm_recog.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/bin/bash - -# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs (Shinji Watanabe) -# 2017 JHU CLSP (Szu-Jui Chen) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) - -nj=12 -stage=1 -order=5 -hidden=300 -rnnweight=0.5 -nbest=100 -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set language models -# You might need to change affix to the affix of your best tdnn model. -affix=1a -lm_suffix=${order}gkn_5k -rnnlm_suffix=rnnlm_5k_h${hidden} - -# enhan data -enhan=$1 -# set model directory -mdir=$2 -srcdir=exp/chain/tdnn_lstm${affix}_sp - -# check language models -if [ ! -d $mdir/data/lang ]; then - echo "error, set $mdir correctly" - exit 1; -fi - -# preparation -dir=exp/chain/tdnn_lstm${affix}_sp_smbr_lmrescore -mkdir -p $dir -# make a symbolic link to graph info -if [ ! -e $dir/graph_tgpr_5k ]; then - if [ ! -e exp/chain/tree_a_sp/graph_tgpr_5k ]; then - echo "graph is missing, execute local/run_tdnn.sh, correctly" - exit 1; - fi - pushd . ; cd $dir - ln -s ../tree_a_sp/graph_tgpr_5k . - popd -fi - -# rescore lattices by a high-order N-gram -if [ $stage -le 3 ]; then - # check the best iteration - if [ ! -f $srcdir/log/best_wer_$enhan ]; then - echo "$0: error $srcdir/log/best_wer_$enhan not found. 
execute local/run_tdnn_lstm.sh, first" - exit 1; - fi - it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` - # rescore lattices - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - $mdir/data/lang_test_tgpr_5k \ - $mdir/data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ - $srcdir/decode_tgpr_5k_${t}_${enhan} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - mkdir -p $dir/log - local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result - - # now rescore lattices after looped decoding - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - data/lang_test_tgpr_5k \ - data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ - $srcdir/decode_looped_tgpr_5k_${t}_${enhan} \ - $dir/decode_looped_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - local/chime4_calc_wers_looped.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_looped_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_looped_${enhan}_${lm_suffix}.result -fi - -# N-best rescoring using a RNNLM -if [ $stage -le 4 ]; then - # check the best lmw - if [ ! -f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then - echo "error, rescoring with a high-order n-gram seems to be failed" - exit 1; - fi - lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` - # rescore n-best list for all sets - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - $mdir/data/lang_test_${lm_suffix} \ - $mdir/data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - - # now rescore lattices after looped decoding - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - data/lang_test_${lm_suffix} \ - data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ - $dir/decode_looped_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_looped_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers_looped.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_looped_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi diff --git a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_recog.sh b/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_recog.sh deleted file mode 100755 index 4508ddeb9f4..00000000000 --- a/egs/chime4/s5_1ch/local/run_lmrescore_tdnn_recog.sh +++ /dev/null @@ -1,124 +0,0 @@ -#!/bin/bash - -# Copyright 2015 University of Sheffield (Jon Barker, Ricard Marxer) -# Inria (Emmanuel Vincent) -# Mitsubishi Electric Research Labs 
(Shinji Watanabe) -# 2017 JHU CLSP (Szu-Jui Chen) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori) - -nj=12 -stage=1 -order=5 -hidden=300 -rnnweight=0.5 -nbest=100 -train=noisy -eval_flag=true # make it true when the evaluation data are released - -. utils/parse_options.sh || exit 1; - -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. - -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - -if [ $# -ne 2 ]; then - printf "\nUSAGE: %s \n\n" `basename $0` - echo "First argument specifies a unique name for different enhancement method" - echo "Second argument specifies acoustic and language model directory" - exit 1; -fi - -# set language models -# You might need to change affix to the affix of your best tdnn model. -affix=1a -lm_suffix=${order}gkn_5k -rnnlm_suffix=rnnlm_5k_h${hidden} - -# enhan data -enhan=$1 -# set model directory -mdir=$2 -srcdir=exp/chain/tdnn${affix}_sp - -# check language models -if [ ! -d $mdir/data/lang ]; then - echo "error, set $mdir correctly" - exit 1; -fi - -# preparation -dir=exp/chain/tdnn${affix}_sp_smbr_lmrescore -mkdir -p $dir -# make a symbolic link to graph info -if [ ! -e $dir/graph_tgpr_5k ]; then - if [ ! -e exp/chain/tree_a_sp/graph_tgpr_5k ]; then - echo "graph is missing, execute local/run_tdnn.sh, correctly" - exit 1; - fi - pushd . ; cd $dir - ln -s ../tree_a_sp/graph_tgpr_5k . - popd -fi - -# rescore lattices by a high-order N-gram -if [ $stage -le 3 ]; then - # check the best iteration - if [ ! -f $srcdir/log/best_wer_$enhan ]; then - echo "$0: error $srcdir/log/best_wer_$enhan not found. execute local/run_tdnn.sh, first" - exit 1; - fi - it=`cut -f 1 -d" " $srcdir/log/best_wer_$enhan | awk -F'[_]' '{print $1}'` - # rescore lattices - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/lmrescore.sh --mode 3 \ - $mdir/data/lang_test_tgpr_5k \ - $mdir/data/lang_test_${lm_suffix} \ - data/${t}_${enhan}_hires \ - $srcdir/decode_tgpr_5k_${t}_${enhan} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} - done - # rescored results by high-order n-gram LM - mkdir -p $dir/log - local/chime4_calc_wers.sh $dir ${enhan}_${lm_suffix} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${lm_suffix}.result - head -n 15 $dir/best_wer_${enhan}_${lm_suffix}.result -fi - -# N-best rescoring using a RNNLM -if [ $stage -le 4 ]; then - # check the best lmw - if [ ! 
-f $dir/log/best_wer_${enhan}_${lm_suffix} ]; then - echo "error, rescoring with a high-order n-gram seems to be failed" - exit 1; - fi - lmw=`cut -f 1 -d" " $dir/log/best_wer_${enhan}_${lm_suffix} | awk -F'[_]' '{print $NF}'` - # rescore n-best list for all sets - if $eval_flag; then - tasks="dt05_simu dt05_real et05_simu et05_real" - else - tasks="dt05_simu dt05_real" - fi - for t in $tasks; do - steps/rnnlmrescore.sh --inv-acwt $lmw --N $nbest --use-phi true \ - $rnnweight \ - $mdir/data/lang_test_${lm_suffix} \ - $mdir/data/lang_test_${rnnlm_suffix} \ - data/${t}_${enhan}_hires \ - $dir/decode_tgpr_5k_${t}_${enhan}_${lm_suffix} \ - $dir/decode_tgpr_5k_${t}_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} - done - # calc wers for RNNLM results - local/chime4_calc_wers.sh $dir ${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest} $dir/graph_tgpr_5k \ - > $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result - head -n 15 $dir/best_wer_${enhan}_${rnnlm_suffix}_w${rnnweight}_n${nbest}.result -fi diff --git a/egs/chime4/s5_1ch/local/run_nn-gev.sh b/egs/chime4/s5_1ch/local/run_nn-gev.sh new file mode 100755 index 00000000000..a17dd3d3f15 --- /dev/null +++ b/egs/chime4/s5_1ch/local/run_nn-gev.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +if [ $# != 4 ]; then + echo "Wrong #arguments ($#, expected 4)" + echo "Usage: local/run_nn-gev.sh " + exit 1; +fi + +sdir=$1 +odir=$2 +enhancement_type=$3 +track=$4 + +gpu_id=1 +case $(hostname -f) in + *.clsp.jhu.edu) gpu_id=`free-gpu` ;; # JHU, +esac + +if [ ! -f local/nn-gev/data/BLSTM_model/mlp.tr ]; then + echo "training a BLSTM mask network" + $HOME/miniconda3/bin/python local/nn-gev/train.py --chime_dir=$sdir/data --gpu $gpu_id local/nn-gev/data BLSTM +else + echo "Not training a BLSTM mask network. Using existing model in local/nn-gev/data/BLSTM_model/" +fi +echo "enhancing signals with mask-based GEV beamformer" +local/nn-gev/beamform.sh $sdir/data local/nn-gev/data $odir local/nn-gev/data/BLSTM_model/best.nnet BLSTM --gpu $gpu_id --single $enhancement_type --track $track diff --git a/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh index 03e355a82ec..124cde82b8a 100755 --- a/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh +++ b/egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh @@ -69,8 +69,12 @@ fi # make a scp file from file list for x in $list_set; do - cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.ids - paste -d" " ${x}_wav.ids $x.flist | sort -k 1 > ${x}_wav.scp + cat $x.flist | awk -F'[/]' '{print $NF}'| sed -e 's/\.wav/_SIMU/' > ${x}_wav.id.temp + cat ${x}_wav.id.temp | awk -F'_' '{print $3}' | awk -F'.' '{print $2}' > $x.ch + cat ${x}_wav.id.temp | awk -F'_' '{print $1}' > $x.part1 + cat ${x}_wav.id.temp | sed -e 's/^..._//' > $x.part2 + paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids + paste -d" " ${x}_wav.ids $x.flist | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp done # make a transcription from dot @@ -80,10 +84,10 @@ if [ !
-e dot_files.flist ]; then echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh"; exit 1; fi -cat tr05_simu_noisy_wav.scp | awk -F'[_]' '{print $2}' | tr '[A-Z]' '[a-z]' \ +cat tr05_simu_noisy_wav.scp.temp | awk -F'[_]' '{print $3}' | tr '[A-Z]' '[a-z]' \ | $local/find_noisy_transcripts.pl dot_files.flist | cut -f 2- -d" " > tr05_simu_noisy.txt -cat tr05_simu_noisy_wav.scp | cut -f 1 -d" " > tr05_simu_noisy.ids -paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -k 1 > tr05_simu_noisy.trans1 +cat tr05_simu_noisy_wav.scp.temp | cut -f 1 -d" " > tr05_simu_noisy.ids +paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt | sort -t_ -k1,1 -k3 > tr05_simu_noisy.trans1 # dt05 and et05 simulation data are generated from the CHiME4 booth recording # and we use CHiME4 dot files cat dt05_simu.dot | sed -e 's/(\(.*\))/\1/' | awk '{print $NF ".CH1_SIMU"}'> dt05_simu_noisy.ids @@ -104,13 +108,17 @@ fi # data-preparation stage independent of the specific lexicon used. noiseword=""; for x in $list_set;do + cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1 + cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2 + paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1 cat $x.trans1 | $local/normalize_transcript.pl $noiseword \ | sort > $x.txt || exit 1; done # Make the utt2spk and spk2utt files. for x in $list_set; do - cat ${x}_wav.scp | awk -F'_' '{print $1}' > $x.spk + sort ${x}_wav.scp.temp > ${x}_wav.scp + cat ${x}_wav.scp | awk -F'_' '{print $1"_"$2}' > $x.spk cat ${x}_wav.scp | awk '{print $1}' > $x.utt paste -d" " $x.utt $x.spk > $x.utt2spk cat $x.utt2spk | $utils/utt2spk_to_spk2utt.pl > $x.spk2utt || exit 1; @@ -125,4 +133,8 @@ for x in $list_set; do cp ${x}.utt2spk ../../$x/utt2spk || exit 1; done +# clean up temp files +rm *.temp +rm *.part{1,2} + echo "Data preparation succeeded" diff --git a/egs/chime4/s5_1ch/local/stoi_estoi_sdr.m b/egs/chime4/s5_1ch/local/stoi_estoi_sdr.m new file mode 100644 index 00000000000..45047fe1884 --- /dev/null +++ b/egs/chime4/s5_1ch/local/stoi_estoi_sdr.m @@ -0,0 +1,62 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +function stoi_estoi_sdr(nj,enhancement_method,destination_directory,set) + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% "stoi_estoi_sdr" : this function computes the average STOI, eSTOI and SDR +% scores by calling downloaded third party matlab functions +% +% Input: +% nj: number of jobs +% enhancement_method: the name of the enhacement method +% destination_directory: the directory where the results have to be stored, +% the list of the enhaced and reference files are +% stored here before calling this function +% set: name of the set to be evaluated ('et05' or 'dt05') +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +original_file_list=strcat(destination_directory,'/original_list'); +enhanced_file_list=strcat(destination_directory,'/enhanced_list'); +files1=textread(original_file_list,'%s'); +files2=textread(enhanced_file_list,'%s'); +d_stoi=zeros(1,length(files2)); +d_estoi=zeros(1,length(files2)); +SDR=zeros(1,length(files2)); +p = parpool('local', nj); +parfor i=1:length(files2) + [x, fs] = audioread(files1{i}); + [y, fs] = audioread(files2{i}); + m=length(x); + n=length(y); + d=abs(m-n); + if m>n + y=[y; 
zeros(d,1)]; + end + if n>m + x=[x; zeros(d,1)]; + end + + d_stoi(i)=stoi(x,y,fs); + d_estoi(i)=estoi(x,y,fs); + [SDR(i),SIR,SAR,perm]=bss_eval_sources(y',x'); +end +SDR_avg=mean(SDR); +STOI_avg=mean(d_stoi); +ESTOI_avg=mean(d_estoi); +SDRFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_SDR'); +stoiFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_STOI'); +estoiFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_eSTOI'); +fileID = fopen(SDRFile,'w'); +fprintf(fileID,'%f\n',SDR_avg); +fclose(fileID); +fileID = fopen(stoiFile,'w'); +fprintf(fileID,'%f\n',STOI_avg); +fclose(fileID); +fileID = fopen(estoiFile,'w'); +fprintf(fileID,'%f\n',ESTOI_avg); +fclose(fileID); +ResultMATFile=strcat(destination_directory,'/',enhancement_method,'_',set,'_stoi_estoi_sdr.mat'); +save(ResultMATFile,'SDR','d_stoi','d_estoi'); +end diff --git a/egs/chime4/s5_1ch/local/write_se_results.sh b/egs/chime4/s5_1ch/local/write_se_results.sh new file mode 100755 index 00000000000..7ada63f8ccc --- /dev/null +++ b/egs/chime4/s5_1ch/local/write_se_results.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: + +if [ $# != 1 ]; then + echo "Wrong #arguments ($#, expected 1)" + echo "Usage: local/write_se_results.sh " + exit 1; +fi + +enhancement=$1 + +echo -e "PESQ ($enhancement) \t dt05_simu=$(cat exp/compute_pesq_$enhancement/pesq_dt05) \t et05_simu=$(cat exp/compute_pesq_$enhancement/pesq_et05)" +echo -e "STOI ($enhancement) \t dt05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_dt05_STOI) \t et05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_et05_STOI)" +echo -e "eSTOI ($enhancement) \t dt05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_dt05_eSTOI) \t et05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_et05_eSTOI)" +echo -e "SDR ($enhancement) \t dt05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_dt05_SDR) \t et05_simu=$(cat exp/compute_stoi_estoi_sdr_$enhancement/${enhancement}_et05_SDR)" +echo "" diff --git a/egs/chime4/s5_1ch/rnnlm b/egs/chime4/s5_1ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/chime4/s5_1ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/chime4/s5_1ch/run.sh b/egs/chime4/s5_1ch/run.sh index beb8c80207f..5b980dec827 100755 --- a/egs/chime4/s5_1ch/run.sh +++ b/egs/chime4/s5_1ch/run.sh @@ -6,26 +6,29 @@ # Inria (Emmanuel Vincent) # Mitsubishi Electric Research Labs (Shinji Watanabe) # 2017 JHU CLSP (Szu-Jui Chen) +# 2017 JHU CLSP (Aswin Shanmugam Subramanian) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh . ./cmd.sh + #####Baseline settings##### # Usage: -# 1. For using original baseline, execute './run.sh --baseline chime4_official'. -# We don't provide the function to train original baseline models anymore. Instead, we provided the -# trained original baseline models in tools/ASR_models for directly using. +# Execute './run.sh' to get the models. +# We provide BLSTM masking based enhancement --enhancement single_blstmmask # -# 2. For using advanced baseline, first execute './run.sh --baseline advanced --flatstart true' to -# get the models. If you want to use DNN instead of TDNN, add option "--tdnn false". -# Then execute './run.sh --baseline advanced' for your experiments. +# We stopped to support the old CHiME-3/4 baseline. 
If you want to reproduce the old results +# Please use the old version of Kaldi, e.g., git checkout 9e8ff73648917836d0870c8f6fdd2ff4bdde384f # Config: stage=0 # resume training with --stage N - -baseline=advanced -flatstart=false -tdnn=true +enhancement=single_blstmmask #### or your method +# if the following options are true, they wouldn't train a model again and will only do decoding +gmm_decode_only=false +tdnn_decode_only=false +# make it true when you want to add enhanced data into training set. But please note that when changing enhancement method, +# you may need to retrain from run_gmm.sh and avoid using decode-only options above +add_enhanced_data=true . utils/parse_options.sh || exit 1; @@ -40,107 +43,82 @@ set -o pipefail # If you use scripts distributed in the CHiME4 package, chime4_data=`pwd`/../.. # Otherwise, please specify it, e.g., -chime4_data=/db/laputa1/data/processed/public/CHiME4 - +# chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime3_data=/data2/archive/speech-db/original/public/CHiME3 case $(hostname -f) in - *.clsp.jhu.edu) chime4_data=/export/corpora4/CHiME4/CHiME3 ;; # JHU, + *.clsp.jhu.edu) + chime4_data=/export/corpora4/CHiME4/CHiME3 # JHU, + chime3_data=/export/corpora5/CHiME3 + ;; esac if [ ! -d $chime4_data ]; then - echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -# Set a model directory for the CHiME4 data. -case $baseline in - chime4_official) - if $flatstart; then - echo "We don't support this anymore for 'chime4_official' baseline" - echo " ... Automatically set it to false" - fi - modeldir=$chime4_data/tools/ASR_models - flatstart=false - ;; - advanced) - modeldir=`pwd` - ;; - *) - echo "Usage: './run.sh --baseline chime4_official' or './run.sh --baseline advanced'" - echo " ... If you haven't run flatstart for advanced baseline, please execute" - echo " ... './run.sh --baseline advanced --flatstart true' first"; - exit 1; -esac - -if [ "$flatstart" = false ]; then - for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ - $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do - [ ! -d $d ] && echo "$0: no such directory $d. specify models correctly" && \ - echo " or execute './run.sh --baseline advanced --flatstart true' first" && exit 1; - done +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -#####check data and model paths finished####### - #####main program start################ # You can execute run_init.sh only "once" # This creates 3-gram LM, FSTs, and basic task files -if [ $stage -le 0 ] && $flatstart; then +if [ $stage -le 0 ]; then local/run_init.sh $chime4_data fi -# In this script, we use non-enhanced 6th microphone signals. -enhancement_method=isolated_1ch_track -enhancement_data=$chime4_data/data/audio/16kHz/$enhancement_method -#if [ $stage -le 1 ]; then -# put your single channel enhancement -#fi +if [[ "$enhancement" == *isolated_1ch_track* ]]; then + enhancement_data=$chime4_data/data/audio/16kHz/isolated_1ch_track +else + enhancement_data=`pwd`/enhan/$enhancement +fi -# GMM based ASR experiment without "retraining" -# Please set a directory of your speech enhancement method. -# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. 
-# The directory structure and audio files must follow the attached baseline enhancement directory +if [ $stage -le 1 ]; then + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 --track 1 $chime4_data $chime3_data $enhancement_data 0 +fi + +# Compute PESQ, STOI, eSTOI, and SDR scores if [ $stage -le 2 ]; then - if $flatstart; then - local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data - else - local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + if [ ! -f local/bss_eval_sources.m ] || [ ! -f local/stoi.m ] || [ ! -f local/estoi.m ] || [ ! -f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh + fi + chime4_rir_data=local/nn-gev/data/audio/16kHz/isolated_ext + if [ ! -d $chime4_rir_data ]; then + echo "$chime4_rir_dir does not exist. Please run 'blstm_gev' enhancement method first;" && exit 1; fi + local/compute_pesq.sh $enhancement $enhancement_data $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh $enhancement $enhancement_data $chime4_rir_data + local/compute_pesq.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data + local/write_se_results.sh $enhancement + local/write_se_results.sh NOISY_1ch fi -# DNN based ASR experiment -# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. -# You may execute it after you would have promising results using GMM-based ASR experiments +# GMM based ASR experiment +# Please set a directory of your speech enhancement method. +# The directory structure and audio files must follow the attached baseline enhancement directory if [ $stage -le 3 ]; then - if $tdnn; then - if $flatstart; then - local/chain/run_tdnn.sh $enhancement_method - else - local/chain/run_tdnn_recog.sh $enhancement_method $modeldir - fi - else - if $flatstart; then - local/run_dnn.sh $enhancement_method - else - local/run_dnn_recog.sh $enhancement_method $modeldir - fi - fi + local/run_gmm.sh --add-enhanced-data $add_enhanced_data \ + --decode-only $gmm_decode_only $enhancement $enhancement_data $chime4_data +fi + +# TDNN based ASR experiment +# Since it takes time to evaluate TDNN, we make the GMM and TDNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 4 ]; then + local/chain/run_tdnn.sh --decode-only $tdnn_decode_only $enhancement fi # LM-rescoring experiment with 5-gram and RNN LMs # It takes a few days to train a RNNLM. -if [ $stage -le 4 ]; then - if $flatstart; then - if $tdnn; then - local/run_lmrescore_tdnn.sh $chime4_data $enhancement_method - else - local/run_lmrescore.sh $chime4_data $enhancement_method - fi - else - if $tdnn; then - local/run_lmrescore_tdnn_recog.sh $enhancement_method $modeldir - else - local/run_lmrescore_recog.sh $enhancement_method $modeldir - fi - fi +if [ $stage -le 5 ]; then + local/run_lmrescore_tdnn.sh $chime4_data $enhancement +fi + +# LM-rescoring experiment with LSTM LMs +if [ $stage -le 6 ]; then + local/rnnlm/run_lstm.sh $enhancement fi echo "Done." 
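Note on running the rewritten s5_1ch recipe: the whole pipeline above is gated on the stage variable, and every variable set before utils/parse_options.sh (stage, enhancement, gmm_decode_only, tdnn_decode_only, add_enhanced_data) can be overridden from the command line, with --some-option mapping onto some_option. A minimal usage sketch, not part of the recipe itself; which stage to restart from depends on which artifacts already exist, and, as the comments above warn, the decode-only flags only make sense when the existing models were trained with the same enhancement:

# Resume after enhancement and SE scoring, reusing existing acoustic models
# and only decoding the chosen enhanced data.
./run.sh --stage 3 \
  --enhancement single_blstmmask \
  --gmm-decode-only true \
  --tdnn-decode-only true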
diff --git a/egs/chime4/s5_2ch/RESULTS b/egs/chime4/s5_2ch/RESULTS index f506b54c5db..156b94ebfa9 100644 --- a/egs/chime4/s5_2ch/RESULTS +++ b/egs/chime4/s5_2ch/RESULTS @@ -19,7 +19,8 @@ et05_simu WER: 27.57% (Average), 20.17% (BUS), 31.81% (CAFE), 29.96% (PEDESTRIAN et05_real WER: 29.03% (Average), 39.37% (BUS), 28.43% (CAFE), 27.56% (PEDESTRIAN), 20.77% (STREET) ------------------- -Advanced baseline: +GMM noisy multi-condition with beamformit using 6 channel data +exp/tri3b_tr05_multi_noisy/best_wer_beamformit_2mics.result ------------------- best overall dt05 WER 17.26% (language model weight = 10) ------------------- @@ -32,6 +33,19 @@ et05_simu WER: 26.85% (Average), 20.08% (BUS), 30.84% (CAFE), 29.03% (PEDESTRIAN et05_real WER: 27.91% (Average), 37.05% (BUS), 29.25% (CAFE), 25.37% (PEDESTRIAN), 19.97% (STREET) ------------------- +GMM noisy multi-condition with BLSTM masking using 6 channel data plus enhanced data +exp/tri3b_tr05_multi_noisy/best_wer_blstm_gev.result +------------------- +best overall dt05 WER 14.57% (language model weight = 10) +------------------- +dt05_simu WER: 15.62% (Average), 12.89% (BUS), 20.49% (CAFE), 14.22% (PEDESTRIAN), 14.90% (STREET) +------------------- +dt05_real WER: 13.52% (Average), 15.52% (BUS), 14.34% (CAFE), 11.57% (PEDESTRIAN), 12.67% (STREET) +------------------- +et05_simu WER: 19.05% (Average), 14.51% (BUS), 21.87% (CAFE), 20.41% (PEDESTRIAN), 19.39% (STREET) +------------------- +et05_real WER: 20.94% (Average), 26.66% (BUS), 21.52% (CAFE), 19.15% (PEDESTRIAN), 16.45% (STREET) +------------------- DNN sMBR exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_2mics.result @@ -48,7 +62,7 @@ et05_simu WER: 19.04% (Average), 14.76% (BUS), 21.72% (CAFE), 19.22% (PEDESTRIAN et05_real WER: 20.44% (Average), 30.02% (BUS), 19.95% (CAFE), 17.79% (PEDESTRIAN), 14.01% (STREET) ------------------- -Advanced baseline: +DNN sMBR using all 6 channel data ------------------- best overall dt05 WER 10.13% (language model weight = 12) (Number of iterations = 3) @@ -77,7 +91,7 @@ et05_simu WER: 16.88% (Average), 12.08% (BUS), 19.70% (CAFE), 16.77% (PEDESTRIAN et05_real WER: 18.07% (Average), 26.77% (BUS), 17.93% (CAFE), 14.76% (PEDESTRIAN), 12.83% (STREET) ------------------- -Advanced baseline: +5-gram rescoring using all 6 channel data ------------------- best overall dt05 WER 8.53% (language model weight = 13) ------------------- @@ -105,7 +119,7 @@ et05_simu WER: 15.33% (Average), 10.66% (BUS), 18.21% (CAFE), 15.61% (PEDESTRIAN et05_real WER: 16.58% (Average), 25.37% (BUS), 15.97% (CAFE), 13.53% (PEDESTRIAN), 11.45% (STREET) ------------------- -Advanced baseline: +RNNLM using all 6 channel data ------------------- best overall dt05 WER 7.46% (language model weight = 14) ------------------- @@ -118,7 +132,7 @@ et05_simu WER: 12.57% (Average), 8.85% (BUS), 14.85% (CAFE), 12.44% (PEDESTRIAN) et05_real WER: 13.33% (Average), 18.94% (BUS), 13.04% (CAFE), 11.85% (PEDESTRIAN), 9.49% (STREET) ------------------- -TDNN +TDNN using all 6 channel data exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result ------------------- best overall dt05 WER 7.89% (language model weight = 10) @@ -132,8 +146,8 @@ et05_simu WER: 13.15% (Average), 9.77% (BUS), 14.16% (CAFE), 13.43% (PEDESTRIAN) et05_real WER: 13.39% (Average), 19.63% (BUS), 11.64% (CAFE), 11.49% (PEDESTRIAN), 10.80% (STREET) ------------------- -TDNN+RNNLM -exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM using all 6 channel data 
+exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_2mics_rnnlm_5k_h300_w0.5_n100.result ------------------- best overall dt05 WER 5.82% (language model weight = 11) ------------------- @@ -145,3 +159,73 @@ et05_simu WER: 9.90% (Average), 7.00% (BUS), 11.15% (CAFE), 10.05% (PEDESTRIAN), ------------------- et05_real WER: 10.53% (Average), 16.90% (BUS), 8.65% (CAFE), 8.52% (PEDESTRIAN), 8.05% (STREET) ------------------- + +TDNN using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp/best_wer_beamformit_5mics.result +------------------- +best overall dt05 WER 7.57% (language model weight = 10) +------------------- +dt05_simu WER: 8.18% (Average), 7.12% (BUS), 10.16% (CAFE), 6.33% (PEDESTRIAN), 9.12% (STREET) +------------------- +dt05_real WER: 6.96% (Average), 9.38% (BUS), 6.46% (CAFE), 4.91% (PEDESTRIAN), 7.09% (STREET) +------------------- +et05_simu WER: 13.14% (Average), 9.92% (BUS), 14.55% (CAFE), 13.26% (PEDESTRIAN), 14.83% (STREET) +------------------- +et05_real WER: 12.81% (Average), 19.27% (BUS), 10.66% (CAFE), 11.29% (PEDESTRIAN), 10.03% (STREET) +------------------- + +TDNN+RNNLM using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_beamformit_2mics_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 5.52% (language model weight = 10) +------------------- +dt05_simu WER: 6.02% (Average), 5.28% (BUS), 7.37% (CAFE), 4.60% (PEDESTRIAN), 6.81% (STREET) +------------------- +dt05_real WER: 5.03% (Average), 7.23% (BUS), 4.26% (CAFE), 3.26% (PEDESTRIAN), 5.35% (STREET) +------------------- +et05_simu WER: 10.35% (Average), 7.84% (BUS), 11.04% (CAFE), 10.55% (PEDESTRIAN), 11.95% (STREET) +------------------- +et05_real WER: 10.20% (Average), 16.21% (BUS), 8.18% (CAFE), 8.43% (PEDESTRIAN), 7.98% (STREET) +------------------- + +TDNN with BLSTM masking using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp/best_wer_blstm_gev.result +------------------- +best overall dt05 WER 6.35% (language model weight = 9) +------------------- +dt05_simu WER: 7.03% (Average), 5.72% (BUS), 9.32% (CAFE), 6.28% (PEDESTRIAN), 6.78% (STREET) +------------------- +dt05_real WER: 5.66% (Average), 6.89% (BUS), 5.99% (CAFE), 4.44% (PEDESTRIAN), 5.34% (STREET) +------------------- +et05_simu WER: 8.80% (Average), 6.80% (BUS), 10.20% (CAFE), 8.37% (PEDESTRIAN), 9.84% (STREET) +------------------- +et05_real WER: 9.46% (Average), 13.42% (BUS), 8.31% (CAFE), 8.76% (PEDESTRIAN), 7.34% (STREET) +------------------- + +TDNN+RNNLM with BLSTM masking using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_5k_h300_w0.5_n100.result +------------------- +best overall dt05 WER 4.41% (language model weight = 11) +------------------- +dt05_simu WER: 5.03% (Average), 4.13% (BUS), 6.83% (CAFE), 4.45% (PEDESTRIAN), 4.72% (STREET) +------------------- +dt05_real WER: 3.79% (Average), 4.68% (BUS), 3.94% (CAFE), 2.95% (PEDESTRIAN), 3.61% (STREET) +------------------- +et05_simu WER: 6.07% (Average), 4.52% (BUS), 6.93% (CAFE), 6.05% (PEDESTRIAN), 6.78% (STREET) +------------------- +et05_real WER: 6.93% (Average), 10.23% (BUS), 6.13% (CAFE), 6.41% (PEDESTRIAN), 4.97% (STREET) +------------------- + +TDNN+RNNLM with BLSTM masking using 6 channel data plus enhanced data +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_lstm_1a_w0.5_n100.result +------------------- +best overall dt05 WER 3.39% (language model weight = 10) +------------------- +dt05_simu WER: 3.94% (Average), 2.99% (BUS), 5.65% (CAFE), 3.44% 
(PEDESTRIAN), 3.67% (STREET) +------------------- +dt05_real WER: 2.85% (Average), 3.58% (BUS), 2.89% (CAFE), 2.07% (PEDESTRIAN), 2.85% (STREET) +------------------- +et05_simu WER: 5.03% (Average), 3.66% (BUS), 5.57% (CAFE), 4.87% (PEDESTRIAN), 6.03% (STREET) +------------------- +et05_real WER: 5.40% (Average), 7.81% (BUS), 4.71% (CAFE), 4.73% (PEDESTRIAN), 4.37% (STREET) +------------------- diff --git a/egs/chime4/s5_2ch/rnnlm b/egs/chime4/s5_2ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/chime4/s5_2ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/chime4/s5_2ch/run.sh b/egs/chime4/s5_2ch/run.sh index e1a3fecbce5..7ae5048c6fa 100755 --- a/egs/chime4/s5_2ch/run.sh +++ b/egs/chime4/s5_2ch/run.sh @@ -6,26 +6,30 @@ # Inria (Emmanuel Vincent) # Mitsubishi Electric Research Labs (Shinji Watanabe) # 2017 JHU CLSP (Szu-Jui Chen) +# 2017 JHU CLSP (Aswin Shanmugam Subramanian) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh . ./cmd.sh + #####Baseline settings##### # Usage: -# 1. For using original baseline, execute './run.sh --baseline chime4_official'. -# We don't provide the function to train original baseline models anymore. Instead, we provided the -# trained original baseline models in tools/ASR_models for directly using. +# Execute './run.sh' to get the models. +# We provide three kinds of beamform methods. Add option --enhancement blstm_gev, or --enhancement beamformit_2mics +# to use them. i.g. './run.sh --enhancement blstm_gev' # -# 2. For using advanced baseline, first execute './run.sh --baseline advanced --flatstart true' to -# get the models. If you want to use DNN instead of TDNN, add option "--tdnn false". -# Then execute './run.sh --baseline advanced' for your experiments. +# We stopped to support the old CHiME-3/4 baseline. If you want to reproduce the old results +# Please use the old version of Kaldi, e.g., git checkout 9e8ff73648917836d0870c8f6fdd2ff4bdde384f # Config: stage=0 # resume training with --stage N - -baseline=advanced -flatstart=false -tdnn=true +enhancement=blstm_gev #### or your method +# if the following options are true, they wouldn't train a model again and will only do decoding +gmm_decode_only=false +tdnn_decode_only=false +# make it true when you want to add enhanced data into training set. But please note that when changing enhancement method, +# you may need to retrain from run_gmm.sh and avoid using decode-only options above +add_enhanced_data=true . utils/parse_options.sh || exit 1; @@ -40,109 +44,89 @@ set -o pipefail # If you use scripts distributed in the CHiME4 package, chime4_data=`pwd`/../.. # Otherwise, please specify it, e.g., -chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime3_data=/data2/archive/speech-db/original/public/CHiME3 case $(hostname -f) in - *.clsp.jhu.edu) chime4_data=/export/corpora4/CHiME4/CHiME3 ;; # JHU, + *.clsp.jhu.edu) + chime4_data=/export/corpora4/CHiME4/CHiME3 # JHU, + chime3_data=/export/corpora5/CHiME3 + ;; esac if [ ! -d $chime4_data ]; then - echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1 + echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -# Set a model directory for the CHiME4 data. -case $baseline in - chime4_official) - if $flatstart; then - echo "We don't support this anymore for 'chime4_official' baseline" - echo " ... 
Automatically set it to false" - fi - modeldir=$chime4_data/tools/ASR_models - flatstart=false - ;; - advanced) - modeldir=`pwd` - ;; - *) - echo "Usage: './run.sh --baseline chime4_official' or './run.sh --baseline advanced'" - echo " ... If you haven't run flatstart to train the model of advanced baseline," - echo " ... please execute './run.sh --baseline advanced --flatstart true' first"; - exit 1; -esac - -if [ "$flatstart" = false ]; then - for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ - $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do - [ ! -d $d ] && echo "$0: no such directory $d. specify models correctly" && \ - echo " or execute './run.sh --baseline advanced --flatstart true' first" && exit 1; - done +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -#####check data and model paths finished####### - #####main program start################ # You can execute run_init.sh only "once" # This creates 3-gram LM, FSTs, and basic task files -if [ $stage -le 0 ] && $flatstart; then +if [ $stage -le 0 ]; then local/run_init.sh $chime4_data fi -# Using Beamformit -# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, -# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 -# note that beamformed wav files are generated in the following directory -enhancement_method=beamformit_2mics -enhancement_data=`pwd`/enhan/$enhancement_method +# Using Beamformit or mask-based beamformer +# note that beamformed WAV files are generated in the following directory +enhancement_data=`pwd`/enhan/$enhancement if [ $stage -le 1 ]; then - local/run_beamform_2ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_2ch_track $enhancement_data + case $enhancement in + beamformit_2mics) + local/run_beamform_2ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_2ch_track $enhancement_data + ;; + blstm_gev) + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 --track 2 $chime4_data $chime3_data $enhancement_data 0 + ;; + *) + echo "Usage: --enhancement blstm_gev, or --enhancement beamformit_2mics" + exit 1; + esac fi -# GMM based ASR experiment without "retraining" -# Please set a directory of your speech enhancement method. -# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. -# The directory structure and audio files must follow the attached baseline enhancement directory +# Compute PESQ, STOI, eSTOI, and SDR scores if [ $stage -le 2 ]; then - if $flatstart; then - local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data - else - local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + if [ ! -f local/bss_eval_sources.m ] || [ ! -f local/stoi.m ] || [ ! -f local/estoi.m ] || [ ! -f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh + fi + chime4_rir_data=local/nn-gev/data/audio/16kHz/isolated_ext + if [ ! -d $chime4_rir_data ]; then + echo "$chime4_rir_dir does not exist. 
Please run 'blstm_gev' enhancement method first;" && exit 1; fi + local/compute_pesq.sh $enhancement $enhancement_data $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh $enhancement $enhancement_data $chime4_rir_data + local/compute_pesq.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data + local/write_se_results.sh $enhancement + local/write_se_results.sh NOISY_1ch fi -# DNN based ASR experiment -# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. -# You may execute it after you would have promising results using GMM-based ASR experiments +# GMM based ASR experiment +# Please set a directory of your speech enhancement method. +# The directory structure and audio files must follow the attached baseline enhancement directory if [ $stage -le 3 ]; then - if $tdnn; then - if $flatstart; then - local/chain/run_tdnn.sh $enhancement_method - else - local/chain/run_tdnn_recog.sh $enhancement_method $modeldir - fi - else - if $flatstart; then - local/run_dnn.sh $enhancement_method - else - local/run_dnn_recog.sh $enhancement_method $modeldir - fi - fi + local/run_gmm.sh --add-enhanced-data $add_enhanced_data \ + --decode-only $gmm_decode_only $enhancement $enhancement_data $chime4_data +fi + +# TDNN based ASR experiment +# Since it takes time to evaluate TDNN, we make the GMM and TDNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 4 ]; then + local/chain/run_tdnn.sh --decode-only $tdnn_decode_only $enhancement fi # LM-rescoring experiment with 5-gram and RNN LMs # It takes a few days to train a RNNLM. -if [ $stage -le 4 ]; then - if $flatstart; then - if $tdnn; then - local/run_lmrescore_tdnn.sh $chime4_data $enhancement_method - else - local/run_lmrescore.sh $chime4_data $enhancement_method - fi - else - if $tdnn; then - local/run_lmrescore_tdnn_recog.sh $enhancement_method $modeldir - else - local/run_lmrescore_recog.sh $enhancement_method $modeldir - fi - fi +if [ $stage -le 5 ]; then + local/run_lmrescore_tdnn.sh $chime4_data $enhancement +fi + +# LM-rescoring experiment with LSTM LMs +if [ $stage -le 6 ]; then + local/rnnlm/run_lstm.sh $enhancement fi echo "Done." 
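As in the 1ch recipe, the stage-1 case statement above only covers the bundled methods (beamformit_2mics and blstm_gev), so plugging in your own 2ch enhancement means supplying the enhanced waveforms yourself. A hedged sketch of that workflow: the name myenh is made up, the files under enhan/myenh must follow the same layout as the baseline enhancement output, and the SE-scoring stage still expects the reverberated references produced by an earlier blstm_gev run.

# Hypothetical custom enhancement "myenh": provide the wav files, then skip
# the built-in enhancement stage and start from SE scoring.
mkdir -p enhan/myenh
# ... write enhanced wav files into enhan/myenh here ...
./run.sh --stage 2 --enhancement myenh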
diff --git a/egs/chime4/s5_6ch/RESULTS b/egs/chime4/s5_6ch/RESULTS index 7d602d49247..266216adc16 100644 --- a/egs/chime4/s5_6ch/RESULTS +++ b/egs/chime4/s5_6ch/RESULTS @@ -19,20 +19,21 @@ et05_simu WER: 21.30% (Average), 15.73% (BUS), 22.94% (CAFE), 22.51% (PEDESTRIAN et05_real WER: 21.83% (Average), 30.17% (BUS), 20.66% (CAFE), 19.82% (PEDESTRIAN), 16.68% (STREET) ------------------- -Advanced baseline: +GMM noisy multi-condition with blstm_gev +exp/tri3b_tr05_multi_noisy/best_wer_blstm_gev.result ------------------- -best overall dt05 WER 13.60% (language model weight = 12) +best overall dt05 WER 11.17% (language model weight = 12) ------------------- -dt05_simu WER: 14.23% (Average), 12.24% (BUS), 17.20% (CAFE), 12.05% (PEDESTRIAN), 15.44% (STREET) +dt05_simu WER: 11.44% (Average), 9.78% (BUS), 14.37% (CAFE), 10.10% (PEDESTRIAN), 11.50% (STREET) ------------------- -dt05_real WER: 12.96% (Average), 15.42% (BUS), 12.94% (CAFE), 10.18% (PEDESTRIAN), 13.30% (STREET) +dt05_real WER: 10.91% (Average), 11.21% (BUS), 11.24% (CAFE), 10.34% (PEDESTRIAN), 10.84% (STREET) ------------------- -et05_simu WER: 20.46% (Average), 14.77% (BUS), 21.78% (CAFE), 22.49% (PEDESTRIAN), 22.81% (STREET) +et05_simu WER: 13.54% (Average), 11.65% (BUS), 14.90% (CAFE), 13.73% (PEDESTRIAN), 13.86% (STREET) ------------------- -et05_real WER: 21.14% (Average), 28.40% (BUS), 21.29% (CAFE), 18.68% (PEDESTRIAN), 16.19% (STREET) +et05_real WER: 14.62% (Average), 16.43% (BUS), 15.43% (CAFE), 12.99% (PEDESTRIAN), 13.63% (STREET) ------------------- -DNN sMBR +DNN sMBR with beamformit exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_beamformit_5mics.result ------------------- best overall dt05 WER 8.60% (language model weight = 11) @@ -47,98 +48,120 @@ et05_simu WER: 14.23% (Average), 10.72% (BUS), 15.52% (CAFE), 13.90% (PEDESTRIAN et05_real WER: 15.00% (Average), 21.74% (BUS), 13.58% (CAFE), 12.84% (PEDESTRIAN), 11.86% (STREET) ------------------- -Advanced baseline: +DNN sMBR with blstm_gev +exp/tri4a_dnn_tr05_multi_noisy_smbr_i1lats/best_wer_blstm_gev.result ------------------- -best overall dt05 WER 7.72% (language model weight = 12) - (Number of iterations = 3) +best overall dt05 WER 7.38% (language model weight = 11) + (Number of iterations = 4) ------------------- -dt05_simu WER: 7.98% (Average), 6.96% (BUS), 9.75% (CAFE), 6.56% (PEDESTRIAN), 8.66% (STREET) +dt05_simu WER: 7.49% (Average), 5.93% (BUS), 9.69% (CAFE), 6.73% (PEDESTRIAN), 7.61% (STREET) ------------------- -dt05_real WER: 7.45% (Average), 9.15% (BUS), 8.10% (CAFE), 5.40% (PEDESTRIAN), 7.17% (STREET) +dt05_real WER: 7.28% (Average), 7.83% (BUS), 7.80% (CAFE), 6.37% (PEDESTRIAN), 7.11% (STREET) ------------------- -et05_simu WER: 12.30% (Average), 9.45% (BUS), 13.26% (CAFE), 11.77% (PEDESTRIAN), 14.74% (STREET) +et05_simu WER: 9.54% (Average), 8.18% (BUS), 10.87% (CAFE), 9.81% (PEDESTRIAN), 9.32% (STREET) ------------------- -et05_real WER: 12.64% (Average), 16.34% (BUS), 12.36% (CAFE), 10.93% (PEDESTRIAN), 10.93% (STREET) +et05_real WER: 9.77% (Average), 11.42% (BUS), 10.22% (CAFE), 9.23% (PEDESTRIAN), 8.22% (STREET) ------------------- -5-gram rescoring -exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_5gkn_5k.result +RNNLM with beamformit +exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result ------------------- -best overall dt05 WER 7.30% (language model weight = 11) +best overall dt05 WER 6.27% (language model weight = 12) ------------------- -dt05_simu WER: 7.75% (Average), 
7.14% (BUS), 9.13% (CAFE), 6.33% (PEDESTRIAN), 8.41% (STREET) +dt05_simu WER: 6.77% (Average), 6.02% (BUS), 8.10% (CAFE), 5.49% (PEDESTRIAN), 7.48% (STREET) ------------------- -dt05_real WER: 6.85% (Average), 8.53% (BUS), 6.90% (CAFE), 4.72% (PEDESTRIAN), 7.24% (STREET) +dt05_real WER: 5.76% (Average), 7.39% (BUS), 5.77% (CAFE), 3.72% (PEDESTRIAN), 6.18% (STREET) ------------------- -et05_simu WER: 12.31% (Average), 8.82% (BUS), 13.04% (CAFE), 11.84% (PEDESTRIAN), 15.54% (STREET) +et05_simu WER: 10.90% (Average), 7.68% (BUS), 11.54% (CAFE), 10.31% (PEDESTRIAN), 14.06% (STREET) ------------------- -et05_real WER: 13.23% (Average), 19.07% (BUS), 11.80% (CAFE), 11.51% (PEDESTRIAN), 10.53% (STREET) +et05_real WER: 11.51% (Average), 16.86% (BUS), 10.18% (CAFE), 9.83% (PEDESTRIAN), 9.19% (STREET) ------------------- -Advanced baseline: +######## Advanced baseline +######## All 6 channel training, enhanced data training, Lattice-free MMI TDNN, BLSTM-mask-based GEV beamformer + +TDNN with beamformit +exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result ------------------- -best overall dt05 WER 6.25% (language model weight = 13) +best overall dt05 WER 6.04% (language model weight = 9) ------------------- -dt05_simu WER: 6.58% (Average), 5.86% (BUS), 7.89% (CAFE), 5.19% (PEDESTRIAN), 7.39% (STREET) +dt05_simu WER: 6.25% (Average), 5.71% (BUS), 6.92% (CAFE), 5.37% (PEDESTRIAN), 7.02% (STREET) ------------------- -dt05_real WER: 5.92% (Average), 7.46% (BUS), 6.19% (CAFE), 4.25% (PEDESTRIAN), 5.77% (STREET) +dt05_real WER: 5.83% (Average), 7.48% (BUS), 5.28% (CAFE), 4.43% (PEDESTRIAN), 6.13% (STREET) ------------------- -et05_simu WER: 10.50% (Average), 7.81% (BUS), 11.06% (CAFE), 10.44% (PEDESTRIAN), 12.70% (STREET) +et05_simu WER: 10.30% (Average), 7.34% (BUS), 10.37% (CAFE), 10.05% (PEDESTRIAN), 13.43% (STREET) ------------------- -et05_real WER: 10.68% (Average), 13.97% (BUS), 10.48% (CAFE), 9.08% (PEDESTRIAN), 9.19% (STREET) +et05_real WER: 9.67% (Average), 12.71% (BUS), 8.33% (CAFE), 8.20% (PEDESTRIAN), 9.45% (STREET) ------------------- -RNNLM -exp/tri4a_dnn_tr05_multi_noisy_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM with beamformit +exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result ------------------- -best overall dt05 WER 6.27% (language model weight = 12) +best overall dt05 WER 4.15% (language model weight = 9) ------------------- -dt05_simu WER: 6.77% (Average), 6.02% (BUS), 8.10% (CAFE), 5.49% (PEDESTRIAN), 7.48% (STREET) +dt05_simu WER: 4.33% (Average), 3.95% (BUS), 4.87% (CAFE), 3.53% (PEDESTRIAN), 4.97% (STREET) ------------------- -dt05_real WER: 5.76% (Average), 7.39% (BUS), 5.77% (CAFE), 3.72% (PEDESTRIAN), 6.18% (STREET) +dt05_real WER: 3.97% (Average), 5.38% (BUS), 3.19% (CAFE), 2.94% (PEDESTRIAN), 4.37% (STREET) ------------------- -et05_simu WER: 10.90% (Average), 7.68% (BUS), 11.54% (CAFE), 10.31% (PEDESTRIAN), 14.06% (STREET) +et05_simu WER: 7.39% (Average), 4.87% (BUS), 7.58% (CAFE), 7.15% (PEDESTRIAN), 9.96% (STREET) ------------------- -et05_real WER: 11.51% (Average), 16.86% (BUS), 10.18% (CAFE), 9.83% (PEDESTRIAN), 9.19% (STREET) +et05_real WER: 7.04% (Average), 9.89% (BUS), 5.49% (CAFE), 5.70% (PEDESTRIAN), 7.10% (STREET) ------------------- -Advanced baseline: +TDNN using 6 channel data plus enhanced data with beamformit +exp/chain/tdnn7a_sp/best_wer_beamformit_5mics.result ------------------- -best overall dt05 WER 5.44% (language model weight = 13) +best overall dt05 WER 5.80% 
(language model weight = 10) ------------------- -dt05_simu WER: 5.82% (Average), 4.90% (BUS), 6.96% (CAFE), 4.62% (PEDESTRIAN), 6.81% (STREET) +dt05_simu WER: 6.19% (Average), 5.96% (BUS), 6.78% (CAFE), 5.10% (PEDESTRIAN), 6.92% (STREET) ------------------- -dt05_real WER: 5.05% (Average), 6.43% (BUS), 5.03% (CAFE), 3.42% (PEDESTRIAN), 5.31% (STREET) +dt05_real WER: 5.41% (Average), 6.86% (BUS), 4.87% (CAFE), 4.00% (PEDESTRIAN), 5.91% (STREET) ------------------- -et05_simu WER: 9.24% (Average), 6.65% (BUS), 9.81% (CAFE), 9.23% (PEDESTRIAN), 11.28% (STREET) +et05_simu WER: 10.26% (Average), 7.68% (BUS), 10.40% (CAFE), 10.16% (PEDESTRIAN), 12.79% (STREET) ------------------- -et05_real WER: 9.50% (Average), 12.64% (BUS), 8.76% (CAFE), 7.96% (PEDESTRIAN), 8.63% (STREET) +et05_real WER: 9.63% (Average), 13.46% (BUS), 7.98% (CAFE), 8.13% (PEDESTRIAN), 8.97% (STREET) ------------------- -TDNN -exp/chain/tdnn1d_sp/best_wer_beamformit_5mics.result +TDNN+RNNLM using 6 channel data plus enhanced data with beamformit +exp/chain/tdnn7a_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +compute dt05 WER for each location ------------------- -best overall dt05 WER 6.04% (language model weight = 9) +best overall dt05 WER 4.02% (language model weight = 11) ------------------- -dt05_simu WER: 6.25% (Average), 5.71% (BUS), 6.92% (CAFE), 5.37% (PEDESTRIAN), 7.02% (STREET) +dt05_simu WER: 4.31% (Average), 4.04% (BUS), 4.88% (CAFE), 3.38% (PEDESTRIAN), 4.94% (STREET) ------------------- -dt05_real WER: 5.83% (Average), 7.48% (BUS), 5.28% (CAFE), 4.43% (PEDESTRIAN), 6.13% (STREET) +dt05_real WER: 3.74% (Average), 4.62% (BUS), 3.17% (CAFE), 3.02% (PEDESTRIAN), 4.14% (STREET) ------------------- -et05_simu WER: 10.30% (Average), 7.34% (BUS), 10.37% (CAFE), 10.05% (PEDESTRIAN), 13.43% (STREET) +et05_simu WER: 7.49% (Average), 5.16% (BUS), 7.21% (CAFE), 7.45% (PEDESTRIAN), 10.14% (STREET) ------------------- -et05_real WER: 9.67% (Average), 12.71% (BUS), 8.33% (CAFE), 8.20% (PEDESTRIAN), 9.45% (STREET) +et05_real WER: 6.84% (Average), 9.74% (BUS), 5.38% (CAFE), 5.25% (PEDESTRIAN), 7.00% (STREET) ------------------- -TDNN+RNNLM -exp/chain/tdnn1d_sp_smbr_lmrescore/best_wer_beamformit_5mics_rnnlm_5k_h300_w0.5_n100.result +TDNN+RNNLM using 6 channel data plus enhanced data with blstm_gev +exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_5k_h300_w0.5_n100.result ------------------- -best overall dt05 WER 4.15% (language model weight = 9) +best overall dt05 WER 3.01% (language model weight = 10) ------------------- -dt05_simu WER: 4.33% (Average), 3.95% (BUS), 4.87% (CAFE), 3.53% (PEDESTRIAN), 4.97% (STREET) +dt05_simu WER: 3.10% (Average), 2.60% (BUS), 4.07% (CAFE), 2.80% (PEDESTRIAN), 2.92% (STREET) ------------------- -dt05_real WER: 3.97% (Average), 5.38% (BUS), 3.19% (CAFE), 2.94% (PEDESTRIAN), 4.37% (STREET) +dt05_real WER: 2.93% (Average), 3.32% (BUS), 2.83% (CAFE), 2.63% (PEDESTRIAN), 2.93% (STREET) ------------------- -et05_simu WER: 7.39% (Average), 4.87% (BUS), 7.58% (CAFE), 7.15% (PEDESTRIAN), 9.96% (STREET) +et05_simu WER: 3.95% (Average), 3.29% (BUS), 4.71% (CAFE), 4.30% (PEDESTRIAN), 3.53% (STREET) ------------------- -et05_real WER: 7.04% (Average), 9.89% (BUS), 5.49% (CAFE), 5.70% (PEDESTRIAN), 7.10% (STREET) -------------------- \ No newline at end of file +et05_real WER: 4.04% (Average), 4.94% (BUS), 3.66% (CAFE), 3.66% (PEDESTRIAN), 3.90% (STREET) +------------------- + +TDNN+LSTMLM using 6 channel data plus enhanced data with blstm_gev 
+exp/chain/tdnn1a_sp_smbr_lmrescore/best_wer_blstm_gev_rnnlm_lstm_1a_w0.5_n100.result +------------------- +best overall dt05 WER 2.00% (language model weight = 11) +------------------- +dt05_simu WER: 2.10% (Average), 2.06% (BUS), 2.58% (CAFE), 1.73% (PEDESTRIAN), 2.02% (STREET) +------------------- +dt05_real WER: 1.90% (Average), 2.05% (BUS), 1.78% (CAFE), 1.68% (PEDESTRIAN), 2.09% (STREET) +------------------- +et05_simu WER: 2.66% (Average), 2.33% (BUS), 2.73% (CAFE), 2.93% (PEDESTRIAN), 2.63% (STREET) +------------------- +et05_real WER: 2.74% (Average), 3.05% (BUS), 2.45% (CAFE), 2.65% (PEDESTRIAN), 2.82% (STREET) +------------------- + diff --git a/egs/chime4/s5_6ch/rnnlm b/egs/chime4/s5_6ch/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/chime4/s5_6ch/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/chime4/s5_6ch/run.sh b/egs/chime4/s5_6ch/run.sh index 090808c026b..1979a040bd8 100755 --- a/egs/chime4/s5_6ch/run.sh +++ b/egs/chime4/s5_6ch/run.sh @@ -1,33 +1,33 @@ -#!/bin/bash - # Kaldi ASR baseline for the CHiME-4 Challenge (6ch track: 6 channel track) # # Copyright 2016 University of Sheffield (Jon Barker, Ricard Marxer) # Inria (Emmanuel Vincent) # Mitsubishi Electric Research Labs (Shinji Watanabe) # 2017 JHU CLSP (Szu-Jui Chen) +# 2017 JHU CLSP (Aswin Shanmugam Subramanian) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) . ./path.sh . ./cmd.sh + #####Baseline settings##### # Usage: -# 1. For using original baseline, execute './run.sh --baseline chime4_official'. -# We don't provide the function to train original baseline models anymore. Instead, we provided the -# trained original baseline models in tools/ASR_models for directly using. +# Execute './run.sh' to get the models. +# We provide three kinds of beamform methods. Add option --enhancement blstm_gev, or --enhancement beamformit_5mics +# or --enhancement single_blstmmask to use them. i.g. './run.sh --enhancement blstm_gev' # -# 2. For using advanced baseline, first execute './run.sh --baseline advanced --flatstart true' to -# get the models. If you want to use TDNN instead of DNN, add option "--tdnn true". If you want to -# use TDNN-LSTM instead of DNN, add option "--tdnn-lstm true". -# Then execute './run.sh --baseline advanced' for your experiments. +# We stopped to support the old CHiME-3/4 baseline. If you want to reproduce the old results +# Please use the old version of Kaldi, e.g., git checkout 9e8ff73648917836d0870c8f6fdd2ff4bdde384f # Config: stage=0 # resume training with --stage N - -baseline=advanced -flatstart=false -tdnn=true -tdnn_lstm=false +enhancement=blstm_gev #### or your method +# if the following options are true, they wouldn't train a model again and will only do decoding +gmm_decode_only=false +tdnn_decode_only=false +# make it true when you want to add enhanced data into training set. But please note that when changing enhancement method, +# you may need to retrain from run_gmm.sh and avoid using decode-only options above +add_enhanced_data=true . utils/parse_options.sh || exit 1; @@ -42,119 +42,92 @@ set -o pipefail # If you use scripts distributed in the CHiME4 package, chime4_data=`pwd`/../.. 
# Otherwise, please specify it, e.g., -chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime4_data=/db/laputa1/data/processed/public/CHiME4 +# chime3_data=/data2/archive/speech-db/original/public/CHiME3 case $(hostname -f) in - *.clsp.jhu.edu) chime4_data=/export/corpora4/CHiME4/CHiME3 ;; # JHU, + *.clsp.jhu.edu) + chime4_data=/export/corpora4/CHiME4/CHiME3 # JHU, + chime3_data=/export/corpora5/CHiME3 + ;; esac if [ ! -d $chime4_data ]; then echo "$chime4_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -# Set a model directory for the CHiME4 data. -case $baseline in - chime4_official) - if $flatstart; then - echo "We don't support this anymore for 'chime4_official' baseline" - echo " ... Automatically set it to false" - fi - modeldir=$chime4_data/tools/ASR_models - flatstart=false - ;; - advanced) - modeldir=`pwd` - ;; - *) - echo "Usage: './run.sh --baseline chime4_official' or './run.sh --baseline advanced'" - echo " ... If you haven't run flatstart for advanced baseline, please execute" - echo " ... './run.sh --baseline advanced --flatstart true' first"; - exit 1; -esac - -if [ "$flatstart" = false ]; then - for d in $modeldir $modeldir/data/{lang,lang_test_tgpr_5k,lang_test_5gkn_5k,lang_test_rnnlm_5k_h300,local} \ - $modeldir/exp/{tri3b_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy,tri4a_dnn_tr05_multi_noisy_smbr_i1lats}; do - [ ! -d $d ] && echo "$0: no such directory $d. specify models correctly" && \ - echo " or execute './run.sh --baseline advanced --flatstart true' first" && exit 1; - done +if [ ! -d $chime3_data ]; then + echo "$chime3_data does not exist. Please specify chime4 data root correctly" && exit 1; fi -#####check data and model paths finished####### - #####main program start################ # You can execute run_init.sh only "once" # This creates 3-gram LM, FSTs, and basic task files -if [ $stage -le 0 ] && $flatstart; then +if [ $stage -le 0 ]; then local/run_init.sh $chime4_data fi -# Using Beamformit -# See Hori et al, "The MERL/SRI system for the 3rd CHiME challenge using beamforming, -# robust feature extraction, and advanced speech recognition," in Proc. ASRU'15 -# note that beamformed wav files are generated in the following directory -enhancement_method=beamformit_5mics -enhancement_data=`pwd`/enhan/$enhancement_method +# Using Beamformit or mask-based beamformer +# note that beamformed WAV files are generated in the following directory +enhancement_data=`pwd`/enhan/$enhancement if [ $stage -le 1 ]; then - local/run_beamform_6ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_6ch_track $enhancement_data + case $enhancement in + beamformit_5mics) + local/run_beamform_6ch_track.sh --cmd "$train_cmd" --nj 20 $chime4_data/data/audio/16kHz/isolated_6ch_track $enhancement_data + ;; + blstm_gev) + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 $chime4_data $chime3_data $enhancement_data 0 + ;; + single_blstmmask) + local/run_blstm_gev.sh --cmd "$train_cmd" --nj 20 $chime4_data $chime3_data $enhancement_data 5 + ;; + *) + echo "Usage: --enhancement blstm_gev, or --enhancement beamformit_5mics , or --enhancement single_blstmmask" + exit 1; + esac fi -# GMM based ASR experiment without "retraining" -# Please set a directory of your speech enhancement method. -# run_gmm_recog.sh can be done every time when you change a speech enhancement technique. 
-# The directory structure and audio files must follow the attached baseline enhancement directory +# Compute PESQ, STOI, eSTOI, and SDR scores if [ $stage -le 2 ]; then - if $flatstart; then - local/run_gmm.sh $enhancement_method $enhancement_data $chime4_data - else - local/run_gmm_recog.sh $enhancement_method $enhancement_data $modeldir + if [ ! -f local/bss_eval_sources.m ] || [ ! -f local/stoi.m ] || [ ! -f local/estoi.m ] || [ ! -f local/PESQ ]; then + # download and install speech enhancement evaluation tools + local/download_se_eval_tool.sh fi + chime4_rir_data=local/nn-gev/data/audio/16kHz/isolated_ext + if [ ! -d $chime4_rir_data ]; then + echo "$chime4_rir_data does not exist. Please run 'blstm_gev' enhancement method first;" && exit 1; + fi + local/compute_pesq.sh $enhancement $enhancement_data $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh $enhancement $enhancement_data $chime4_rir_data + local/compute_pesq.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data $PWD + local/compute_stoi_estoi_sdr.sh NOISY_1ch $chime4_data/data/audio/16kHz/isolated_1ch_track/ $chime4_rir_data + local/write_se_results.sh $enhancement + local/write_se_results.sh NOISY_1ch fi -# DNN based ASR experiment -# Since it takes time to evaluate DNN, we make the GMM and DNN scripts separately. -# You may execute it after you would have promising results using GMM-based ASR experiments +# GMM based ASR experiment +# Please set a directory of your speech enhancement method. +# The directory structure and audio files must follow the attached baseline enhancement directory if [ $stage -le 3 ]; then - if $tdnn; then - if $flatstart; then - local/chain/run_tdnn.sh $enhancement_method - else - local/chain/run_tdnn_recog.sh $enhancement_method $modeldir - fi - elif $tdnn_lstm; then - if $flatstart; then - local/chain/run_tdnn_lstm.sh $enhancement_method - else - local/chain/run_tdnn_lstm_recog.sh $enhancement_method $modeldir - fi - else - if $flatstart; then - local/run_dnn.sh $enhancement_method - else - local/run_dnn_recog.sh $enhancement_method $modeldir - fi - fi + local/run_gmm.sh --add-enhanced-data $add_enhanced_data \ + --decode-only $gmm_decode_only $enhancement $enhancement_data $chime4_data +fi + +# TDNN based ASR experiment +# Since it takes time to evaluate TDNN, we make the GMM and TDNN scripts separately. +# You may execute it after you would have promising results using GMM-based ASR experiments +if [ $stage -le 4 ]; then + local/chain/run_tdnn.sh --decode-only $tdnn_decode_only $enhancement fi -flatstart=false + # LM-rescoring experiment with 5-gram and RNN LMs # It takes a few days to train a RNNLM. -if [ $stage -le 4 ]; then - if $flatstart; then - if $tdnn; then - local/run_lmrescore_tdnn.sh $chime4_data $enhancement_method - elif $tdnn_lstm; then - local/run_lmrescore_tdnn_lstm.sh $chime4_data $enhancement_method - else - local/run_lmrescore.sh $chime4_data $enhancement_method - fi - else - if $tdnn; then - local/run_lmrescore_tdnn_recog.sh $enhancement_method $modeldir - elif $tdnn_lstm; then - local/run_lmrescore_tdnn_lstm_recog.sh $enhancement_method $modeldir - else - local/run_lmrescore_recog.sh $enhancement_method $modeldir - fi - fi +if [ $stage -le 5 ]; then + local/run_lmrescore_tdnn.sh $chime4_data $enhancement +fi + +# LM-rescoring experiment with LSTM LMs +if [ $stage -le 6 ]; then + local/rnnlm/run_lstm.sh $enhancement fi echo "Done." 
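Each decoding or rescoring step in these recipes writes a best_wer_*.result file under exp/, and the RESULTS files above are essentially collected excerpts of those. An illustrative command (not part of the recipe) for pulling the headline lines together, assuming the relevant stages have finished and that each .result file contains the usual "best overall dt05 WER" summary line:

# Collect the summary line from every result file that exists.
grep -H "best overall dt05 WER" exp/*/best_wer_*.result exp/chain/*/best_wer_*.result 2>/dev/null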
diff --git a/egs/chime5/s5/cmd.sh b/egs/chime5/s5/cmd.sh index a697a22cda3..9702501f1a7 100644 --- a/egs/chime5/s5/cmd.sh +++ b/egs/chime5/s5/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" +export train_cmd="retry.pl queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" diff --git a/egs/chime5/s5/local/chain/run_tdnn.sh b/egs/chime5/s5/local/chain/run_tdnn.sh index 34499362831..61f8f499182 120000 --- a/egs/chime5/s5/local/chain/run_tdnn.sh +++ b/egs/chime5/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh index 45a7fd84bd6..f0f469e46c8 100755 --- a/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1a.sh @@ -24,21 +24,16 @@ decode_iter= # training options # training chunk-options chunk_width=140,100,160 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 common_egs_dir= xent_regularize=0.1 # training options srand=0 remove_egs=true -reporting_email= #decode options test_online_decoding=false # if true, it will run the last decoding stage. - # End configuration section. echo "$0 $@" # Print the command line for logging @@ -59,7 +54,7 @@ fi # run those things. local/nnet3/run_ivector_common.sh --stage $stage \ --train-set $train_set \ - --test-sets "$test_sets" \ + --test-sets "$test_sets" \ --gmm $gmm \ --nnet3-affix "$nnet3_affix" || exit 1; @@ -133,7 +128,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.05" output_opts="l2-regularize=0.01 bottleneck-dim=320" @@ -176,7 +171,6 @@ EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi - if [ $stage -le 14 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ @@ -204,15 +198,10 @@ if [ $stage -le 14 ]; then --trainer.num-chunk-per-minibatch=256,128,64 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ --use-gpu=true \ - --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ --tree-dir=$tree_dir \ --lat-dir=$lat_dir \ @@ -235,10 +224,6 @@ if [ $stage -le 16 ]; then ( steps/nnet3/decode.sh \ --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj 8 --cmd "$decode_cmd" --num-threads 4 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ diff --git a/egs/chime5/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/chime5/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..920f2543132 --- /dev/null +++ b/egs/chime5/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,277 @@ +#!/bin/bash + +# This factorized TDNN (TDNN-F) script is ported from s5b recipe +# It uses resnet-style skip connections. +# For details, refer to the paper: +# "Semi-Orthogonal Low-Rank Matrix Factorization for Deep Neural Networks", Daniel Povey, Gaofeng Cheng, Yiming Wang, Ke Li, Hainan Xu, Mahsa Yarmohamadi, Sanjeev Khudanpur, Interspeech 2018 + +# %WER 73.03 [ 43001 / 58881, 4433 ins, 22250 del, 16318 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1b_sp/decode_dev_beamformit_ref/wer_10_0.0 +# %WER 38.88 [ 22895 / 58881, 1882 ins, 8235 del, 12778 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1b_sp/decode_dev_worn/wer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain_train_worn_u100k_cleaned/tdnn1b_sp +# exp/chain_train_worn_u100k_cleaned/tdnn1b_sp: num-iters=96 nj=3..16 num-params=17.1M dim=40+100->2928 combine=-0.125->-0.125 (over 2) xent:train/valid[63,95,final]=(-2.12,-1.81,-1.82/-2.20,-1.96,-1.96) logprob:train/valid[63,95,final]=(-0.190,-0.126,-0.125/-0.218,-0.183,-0.183) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u100k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + 
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.00025 \ + --trainer.optimization.final-effective-lrate=0.000025 \ + --trainer.num-chunk-per-minibatch=64 \ + --egs.stage $get_egs_stage \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! 
-d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh' and '../../../tools/extras/install_wpe.sh';" +fi + +# check if WPE is installed +result=`$HOME/miniconda3/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $HOME/miniconda3/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5/local/score_for_submit.sh b/egs/chime5/s5/local/score_for_submit.sh index 5502c5994e5..23121d68b93 100755 --- a/egs/chime5/s5/local/score_for_submit.sh +++ b/egs/chime5/s5/local/score_for_submit.sh @@ -43,7 +43,7 @@ for session in S02 S09; do # get nerror nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") - nwrd=`grep " ref " $score_result | grep $room | grep $session | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` @@ -59,7 +59,7 @@ echo -n "overall: " # get nerror nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") -nwrd=`grep " ref " $score_result | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` echo -n "#words $nwrd, " @@ -81,7 +81,7 @@ for session in S01 S21; do # get nerror nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") - nwrd=`grep " ref " $score_result | grep $room | grep $session | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` @@ -98,7 +98,7 @@ if $do_eval; then # get nerror nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` # get nwords from references (NF-2 means to exclude utterance id and " ref ") - nwrd=`grep " ref " $score_result | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + nwrd=`grep 
"\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` # compute wer with scale=2 wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` echo -n "overall: " diff --git a/egs/chime5/s5b/RESULTS b/egs/chime5/s5b/RESULTS new file mode 100644 index 00000000000..0dcea1f0031 --- /dev/null +++ b/egs/chime5/s5b/RESULTS @@ -0,0 +1,33 @@ + +# tri2 +%WER 76.40 [ 44985 / 58881, 3496 ins, 17652 del, 23837 sub ] exp/tri2/decode_dev_worn/wer_13_1.0 +%WER 93.56 [ 55091 / 58881, 2132 ins, 35555 del, 17404 sub ] exp/tri2/decode_dev_beamformit_ref/wer_17_1.0 + +# tri3 +%WER 72.81 [ 42869 / 58881, 3629 ins, 15998 del, 23242 sub ] exp/tri3/decode_dev_worn/wer_15_1.0 +%WER 91.73 [ 54013 / 58881, 3519 ins, 27098 del, 23396 sub ] exp/tri3/decode_dev_beamformit_ref/wer_17_1.0 + +# nnet3 tdnn+chain +%WER 47.91 [ 28212 / 58881, 2843 ins, 8957 del, 16412 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_worn/wer_9_0.0 +%WER 81.28 [ 47859 / 58881, 4210 ins, 27511 del, 16138 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref/wer_9_0.5 + +# result with the challenge submission format (July 9, 2018) +# before the fix of speaker ID across arrays +session S02 room DINING: #words 8288, #errors 6593, wer 79.54 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12219, wer 79.03 % +session S09 room DINING: #words 5766, #errors 4651, wer 80.66 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6023, wer 77.61 % +overall: #words 58881, #errors 47859, wer 81.28 % + +# result with the challenge submission format (July 9, 2018) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 6556, wer 79.10 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12182, wer 78.79 % +session S09 room DINING: #words 5766, #errors 4648, wer 80.61 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6022, wer 77.60 % +overall: #words 58881, #errors 47781, wer 81.14 % diff --git a/egs/chime5/s5b/cmd.sh b/egs/chime5/s5b/cmd.sh new file mode 100644 index 00000000000..9702501f1a7 --- /dev/null +++ b/egs/chime5/s5b/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime5/s5b/conf/beamformit.cfg b/egs/chime5/s5b/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime5/s5b/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag wether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag wether to print the feaures after setting them, or not +print_features = 1 + +#flag wether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag wether to use a uem file or not(process all the file) +do_use_uem_file = 0 + +#flag wether to use an adaptative weights scheme or fixed weights +do_adapt_weights = 1 + +#flag wether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime5/s5b/conf/mfcc.conf b/egs/chime5/s5b/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime5/s5b/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime5/s5b/conf/mfcc_hires.conf b/egs/chime5/s5b/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime5/s5b/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime5/s5b/conf/online_cmvn.conf b/egs/chime5/s5b/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime5/s5b/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime5/s5b/local/chain/run_tdnn.sh b/egs/chime5/s5b/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/chime5/s5b/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..95e9d934bd3 --- /dev/null +++ b/egs/chime5/s5b/local/chain/tuning/run_cnn_tdnn_lstm_1a.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u400k_cleaned +test_sets="dev_beamformit_ref" +gmm=tri3_cleaned +nnet3_affix=_train_worn_u400k_cleaned +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=_1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 # 2 works better than 4 +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + conv-relu-batchnorm-layer name=cnn1 input=idct height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 input=cnn1 height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + relu-batchnorm-layer name=affine1 input=lda dim=512 + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 input=cnn2 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,affine1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
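  # (Editor's note, not part of this patch.) With the xent_regularize=0.025 set
  # at the top of this script, the factor described above works out to
  # learning_rate_factor = 0.5 / 0.025 = 20.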
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
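  # (Editor's note, a rough sketch of what this stage does; details may vary by
  # Kaldi version and are not part of this patch.) prepare_online_decoding.sh
  # bundles the trained model with the i-vector extractor from
  # exp/nnet3${nnet3_affix}/extractor into ${dir}_online and writes an online
  # decoding configuration there, so the decoding run in this stage can start
  # from raw wav input instead of precomputed features and i-vectors.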
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
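  # (Editor's note, not part of this patch.) This second output is used only as
  # a training-time regularizer; decoding uses the 'output' layer of the chain
  # branch above, so the xent branch is not evaluated at test time.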
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2792 combine=-0.149->-0.149 (over 2) xent:train/valid[210,316,final]=(-2.50,-1.99,-2.00/-2.36,-1.95,-1.95) logprob:train/valid[210,316,final]=(-0.228,-0.136,-0.136/-0.223,-0.156,-0.155) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u400k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u400k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +common_egs_dir= +# training options +# training chunk-options +chunk_width=140,100,160 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule "$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..e3d8e6ac4dc --- /dev/null +++ b/egs/chime5/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,297 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u400k_cleaned +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3_cleaned +nnet3_affix=_train_worn_u400k_cleaned +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
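# (Editor's note, not part of this patch.) Because this model contains
# recurrent (fast-lstmp) layers, the chunk_left_context / extra_left_context
# settings below do matter here: they are passed to the egs dumping in stage 14
# and to decode.sh in stage 16, unlike the plain TDNN(-F) recipes in this
# patch, where those flags are removed.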
+affix=_1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +common_egs_dir= + +hidden_dim=1024 +cell_dim=1024 +projection_dim=256 + +# training options +num_epochs=2 # 2 works better than 4 +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.025 +label_delay=5 + +# decode options +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=40" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=$hidden_dim + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=$hidden_dim + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=$hidden_dim + + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm3 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + 
relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=$hidden_dim + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=$hidden_dim + fast-lstmp-layer name=lstm4 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim delay=-3 dropout-proportion=0.0 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm4 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm4 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.srand=$srand \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
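  # (Editor's note, not part of this patch.) mkgraph.sh composes the decoding
  # graph (HCLG.fst) under $tree_dir/graph${lm_suffix}; the --self-loop-scale 1.0
  # used here goes together with the --acwt 1.0 --post-decode-acwt 10.0 options
  # of the chain-model decoding in the next stage.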
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +command -v uconv &>/dev/null \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +command -v ngram &>/dev/null \ + || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +command -v phonetisaurus-align &>/dev/null \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +command -v BeamformIt &>/dev/null \ + || { echo >&2 "BeamformIt not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +exit 0 diff --git a/egs/chime5/s5b/local/copy_lat_dir_parallel.sh b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh new file mode 100755 index 00000000000..82839604c9e --- /dev/null +++ b/egs/chime5/s5b/local/copy_lat_dir_parallel.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +cmd=queue.pl +nj=40 +stage=0 +speed_perturb=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +utt_map=$1 +data=$2 +srcdir=$3 +dir=$4 + +mkdir -p $dir + +cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1 +cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} 2>/dev/null || true + +nj_src=$(cat $srcdir/num_jobs) || exit 1 + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \ + lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \ + ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1 +fi + +for n in $(seq $nj_src); do + cat $dir/lat_orig.$n.scp +done > $dir/lat_orig.scp || exit 1 + +if $speed_perturb; then + for s in 0.9 1.1; do + awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map + done | cat - $utt_map | sort -k1,1 > $dir/utt_map + utt_map=$dir/utt_map +fi + +if [ $stage -le 2 ]; then + utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/lat_orig.scp > \ + $dir/lat.scp || exit 1 + + if [ ! -s $dir/lat.scp ]; then + echo "$0: $dir/lat.scp is empty. Something went wrong!" + exit 1 + fi +fi + +utils/split_data.sh $data $nj + +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1 +fi + +echo $nj > $dir/num_jobs + +if [ -f $srcdir/ali.1.gz ]; then + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \ + copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1 + fi + + for n in $(seq $nj_src); do + cat $dir/ali_orig.$n.scp + done > $dir/ali_orig.scp || exit 1 + + if [ $stage -le 5 ]; then + utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/ali_orig.scp > \ + $dir/ali.scp || exit 1 + + if [ ! -s $dir/ali.scp ]; then + echo "$0: $dir/ali.scp is empty. Something went wrong!" 
+ exit 1 + fi + fi + + utils/split_data.sh $data $nj + + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1 + fi +fi + +rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true diff --git a/egs/chime5/s5b/local/distant_audio_list b/egs/chime5/s5b/local/distant_audio_list new file mode 100644 index 00000000000..fc7aff15cd0 --- /dev/null +++ b/egs/chime5/s5b/local/distant_audio_list @@ -0,0 +1,376 @@ +S03_U01.CH1 +S03_U01.CH2 +S03_U01.CH3 +S03_U01.CH4 +S03_U02.CH1 +S03_U02.CH2 +S03_U02.CH3 +S03_U02.CH4 +S03_U03.CH1 +S03_U03.CH2 +S03_U03.CH3 +S03_U03.CH4 +S03_U04.CH1 +S03_U04.CH2 +S03_U04.CH3 +S03_U04.CH4 +S03_U05.CH1 +S03_U05.CH2 +S03_U05.CH3 +S03_U05.CH4 +S03_U06.CH1 +S03_U06.CH2 +S03_U06.CH3 +S03_U06.CH4 +S04_U01.CH1 +S04_U01.CH2 +S04_U01.CH3 +S04_U01.CH4 +S04_U02.CH1 +S04_U02.CH2 +S04_U02.CH3 +S04_U02.CH4 +S04_U03.CH1 +S04_U03.CH2 +S04_U03.CH3 +S04_U03.CH4 +S04_U04.CH1 +S04_U04.CH2 +S04_U04.CH3 +S04_U04.CH4 +S04_U05.CH1 +S04_U05.CH2 +S04_U05.CH3 +S04_U05.CH4 +S04_U06.CH1 +S04_U06.CH2 +S04_U06.CH3 +S04_U06.CH4 +S05_U01.CH1 +S05_U01.CH2 +S05_U01.CH3 +S05_U01.CH4 +S05_U02.CH1 +S05_U02.CH2 +S05_U02.CH3 +S05_U02.CH4 +S05_U04.CH1 +S05_U04.CH2 +S05_U04.CH3 +S05_U04.CH4 +S05_U05.CH1 +S05_U05.CH2 +S05_U05.CH3 +S05_U05.CH4 +S05_U06.CH1 +S05_U06.CH2 +S05_U06.CH3 +S05_U06.CH4 +S06_U01.CH1 +S06_U01.CH2 +S06_U01.CH3 +S06_U01.CH4 +S06_U02.CH1 +S06_U02.CH2 +S06_U02.CH3 +S06_U02.CH4 +S06_U03.CH1 +S06_U03.CH2 +S06_U03.CH3 +S06_U03.CH4 +S06_U04.CH1 +S06_U04.CH2 +S06_U04.CH3 +S06_U04.CH4 +S06_U05.CH1 +S06_U05.CH2 +S06_U05.CH3 +S06_U05.CH4 +S06_U06.CH1 +S06_U06.CH2 +S06_U06.CH3 +S06_U06.CH4 +S07_U01.CH1 +S07_U01.CH2 +S07_U01.CH3 +S07_U01.CH4 +S07_U02.CH1 +S07_U02.CH2 +S07_U02.CH3 +S07_U02.CH4 +S07_U03.CH1 +S07_U03.CH2 +S07_U03.CH3 +S07_U03.CH4 +S07_U04.CH1 +S07_U04.CH2 +S07_U04.CH3 +S07_U04.CH4 +S07_U05.CH1 +S07_U05.CH2 +S07_U05.CH3 +S07_U05.CH4 +S07_U06.CH1 +S07_U06.CH2 +S07_U06.CH3 +S07_U06.CH4 +S08_U01.CH1 +S08_U01.CH2 +S08_U01.CH3 +S08_U01.CH4 +S08_U02.CH1 +S08_U02.CH2 +S08_U02.CH3 +S08_U02.CH4 +S08_U03.CH1 +S08_U03.CH2 +S08_U03.CH3 +S08_U03.CH4 +S08_U04.CH1 +S08_U04.CH2 +S08_U04.CH3 +S08_U04.CH4 +S08_U05.CH1 +S08_U05.CH2 +S08_U05.CH3 +S08_U05.CH4 +S08_U06.CH1 +S08_U06.CH2 +S08_U06.CH3 +S08_U06.CH4 +S12_U01.CH1 +S12_U01.CH2 +S12_U01.CH3 +S12_U01.CH4 +S12_U02.CH1 +S12_U02.CH2 +S12_U02.CH3 +S12_U02.CH4 +S12_U03.CH1 +S12_U03.CH2 +S12_U03.CH3 +S12_U03.CH4 +S12_U04.CH1 +S12_U04.CH2 +S12_U04.CH3 +S12_U04.CH4 +S12_U05.CH1 +S12_U05.CH2 +S12_U05.CH3 +S12_U05.CH4 +S12_U06.CH1 +S12_U06.CH2 +S12_U06.CH3 +S12_U06.CH4 +S13_U01.CH1 +S13_U01.CH2 +S13_U01.CH3 +S13_U01.CH4 +S13_U02.CH1 +S13_U02.CH2 +S13_U02.CH3 +S13_U02.CH4 +S13_U03.CH1 +S13_U03.CH2 +S13_U03.CH3 +S13_U03.CH4 +S13_U04.CH1 +S13_U04.CH2 +S13_U04.CH3 +S13_U04.CH4 +S13_U05.CH1 +S13_U05.CH2 +S13_U05.CH3 +S13_U05.CH4 +S13_U06.CH1 +S13_U06.CH2 +S13_U06.CH3 +S13_U06.CH4 +S16_U01.CH1 +S16_U01.CH2 +S16_U01.CH3 +S16_U01.CH4 +S16_U02.CH1 +S16_U02.CH2 +S16_U02.CH3 +S16_U02.CH4 +S16_U03.CH1 +S16_U03.CH2 +S16_U03.CH3 +S16_U03.CH4 +S16_U04.CH1 +S16_U04.CH2 +S16_U04.CH3 +S16_U04.CH4 +S16_U05.CH1 +S16_U05.CH2 +S16_U05.CH3 +S16_U05.CH4 +S16_U06.CH1 +S16_U06.CH2 +S16_U06.CH3 +S16_U06.CH4 +S17_U01.CH1 +S17_U01.CH2 +S17_U01.CH3 +S17_U01.CH4 +S17_U02.CH1 +S17_U02.CH2 +S17_U02.CH3 +S17_U02.CH4 +S17_U03.CH1 +S17_U03.CH2 +S17_U03.CH3 +S17_U03.CH4 +S17_U04.CH1 +S17_U04.CH2 +S17_U04.CH3 +S17_U04.CH4 +S17_U05.CH1 
+S17_U05.CH2 +S17_U05.CH3 +S17_U05.CH4 +S17_U06.CH1 +S17_U06.CH2 +S17_U06.CH3 +S17_U06.CH4 +S18_U01.CH1 +S18_U01.CH2 +S18_U01.CH3 +S18_U01.CH4 +S18_U02.CH1 +S18_U02.CH2 +S18_U02.CH3 +S18_U02.CH4 +S18_U03.CH1 +S18_U03.CH2 +S18_U03.CH3 +S18_U03.CH4 +S18_U04.CH1 +S18_U04.CH2 +S18_U04.CH3 +S18_U04.CH4 +S18_U05.CH1 +S18_U05.CH2 +S18_U05.CH3 +S18_U05.CH4 +S18_U06.CH1 +S18_U06.CH2 +S18_U06.CH3 +S18_U06.CH4 +S19_U01.CH1 +S19_U01.CH2 +S19_U01.CH3 +S19_U01.CH4 +S19_U02.CH1 +S19_U02.CH2 +S19_U02.CH3 +S19_U02.CH4 +S19_U03.CH1 +S19_U03.CH2 +S19_U03.CH3 +S19_U03.CH4 +S19_U04.CH1 +S19_U04.CH2 +S19_U04.CH3 +S19_U04.CH4 +S19_U05.CH1 +S19_U05.CH2 +S19_U05.CH3 +S19_U05.CH4 +S19_U06.CH1 +S19_U06.CH2 +S19_U06.CH3 +S19_U06.CH4 +S20_U01.CH1 +S20_U01.CH2 +S20_U01.CH3 +S20_U01.CH4 +S20_U02.CH1 +S20_U02.CH2 +S20_U02.CH3 +S20_U02.CH4 +S20_U03.CH1 +S20_U03.CH2 +S20_U03.CH3 +S20_U03.CH4 +S20_U04.CH1 +S20_U04.CH2 +S20_U04.CH3 +S20_U04.CH4 +S20_U05.CH1 +S20_U05.CH2 +S20_U05.CH3 +S20_U05.CH4 +S20_U06.CH1 +S20_U06.CH2 +S20_U06.CH3 +S20_U06.CH4 +S22_U01.CH1 +S22_U01.CH2 +S22_U01.CH3 +S22_U01.CH4 +S22_U02.CH1 +S22_U02.CH2 +S22_U02.CH3 +S22_U02.CH4 +S22_U04.CH1 +S22_U04.CH2 +S22_U04.CH3 +S22_U04.CH4 +S22_U05.CH1 +S22_U05.CH2 +S22_U05.CH3 +S22_U05.CH4 +S22_U06.CH1 +S22_U06.CH2 +S22_U06.CH3 +S22_U06.CH4 +S23_U01.CH1 +S23_U01.CH2 +S23_U01.CH3 +S23_U01.CH4 +S23_U02.CH1 +S23_U02.CH2 +S23_U02.CH3 +S23_U02.CH4 +S23_U03.CH1 +S23_U03.CH2 +S23_U03.CH3 +S23_U03.CH4 +S23_U04.CH1 +S23_U04.CH2 +S23_U04.CH3 +S23_U04.CH4 +S23_U05.CH1 +S23_U05.CH2 +S23_U05.CH3 +S23_U05.CH4 +S23_U06.CH1 +S23_U06.CH2 +S23_U06.CH3 +S23_U06.CH4 +S24_U01.CH1 +S24_U01.CH2 +S24_U01.CH3 +S24_U01.CH4 +S24_U02.CH1 +S24_U02.CH2 +S24_U02.CH3 +S24_U02.CH4 +S24_U03.CH1 +S24_U03.CH2 +S24_U03.CH3 +S24_U03.CH4 +S24_U04.CH1 +S24_U04.CH2 +S24_U04.CH3 +S24_U04.CH4 +S24_U05.CH1 +S24_U05.CH2 +S24_U05.CH3 +S24_U05.CH4 +S24_U06.CH1 +S24_U06.CH2 +S24_U06.CH3 +S24_U06.CH4 diff --git a/egs/chime5/s5b/local/extract_noises.py b/egs/chime5/s5b/local/extract_noises.py new file mode 100755 index 00000000000..f7b7f752d9e --- /dev/null +++ b/egs/chime5/s5b/local/extract_noises.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import sys +import scipy.io.wavfile as siw +import math +import numpy as np + + +def get_args(): + parser = argparse.ArgumentParser( + """Extract noises from the corpus based on the non-speech regions. + e.g. {} /export/corpora4/CHiME5/audio/train/ \\ + /export/corpora4/CHiME5/transcriptions/train/ \\ + /export/b05/zhiqiw/noise/""".format(sys.argv[0])) + + parser.add_argument("--segment-length", default=20) + parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""") + parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""") + parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""") + parser.add_argument("out_dir", help="Output directory to write noise files. e.g. 
/export/b05/zhiqiw/noise/") + + args = parser.parse_args() + return args + + +def Trans_time(time, fs): + units = time.split(':') + time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2]) + return int(time_second*fs) + + +def Get_time(conf, tag, mic, fs): + for i in conf: + st = Trans_time(i['start_time'][mic], fs) + ed = Trans_time(i['end_time'][mic], fs) + tag[st:ed] = 0 + return tag + + +def write_noise(out_dir, seg, audio, sig, tag, fs, cnt): + sig_noise = sig[np.nonzero(tag)] + for i in range(math.floor(len(sig_noise)/(seg*fs))): + siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs]) + cnt += 1 + return cnt + + +def main(): + args = get_args() + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + wav_list = open(args.audio_list).readlines() + + cnt = 1 + for i, audio in enumerate(wav_list): + parts = audio.strip().split('.') + if len(parts) == 2: + # Assuming distant mic with name like S03_U01.CH1 + session, mic = parts[0].split('_') + channel = parts[1] + base_name = session + "_" + mic + "." + channel + else: + # Assuming close talk mic with name like S03_P09 + session, mic = audio.strip().split('_') + base_name = session + "_" + mic + fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav') + tag = np.ones(len(sig)) + if i == 0 or session != session_p: + with open(args.trans_dir + "/" + session + '.json') as f: + conf = json.load(f) + tag = Get_time(conf, tag, mic, fs) + cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt) + session_p = session + + +if __name__ == '__main__': + main() diff --git a/egs/chime5/s5b/local/extract_vad_weights.sh b/egs/chime5/s5b/local/extract_vad_weights.sh new file mode 100755 index 00000000000..250b021bd8f --- /dev/null +++ b/egs/chime5/s5b/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! 
-f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + $utt2ref = { }; + while () { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while () { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/chime5/s5b/local/json2text.py b/egs/chime5/s5b/local/json2text.py new file mode 100755 index 00000000000..4df0160efb6 --- /dev/null +++ b/egs/chime5/s5b/local/json2text.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + + start_time = x['start_time'][mictype] + end_time = x['end_time'][mictype] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', '').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + 
end_time = hms_to_seconds(end_time) + + uttid = speaker_id + '_' + session_id + if not args.mictype == 'worn': + uttid += '_' + mictype + uttid += '_' + location + '-' + start_time + '-' + end_time + + if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime5/s5b/local/make_noise_list.py b/egs/chime5/s5b/local/make_noise_list.py new file mode 100755 index 00000000000..5aaf7fa4062 --- /dev/null +++ b/egs/chime5/s5b/local/make_noise_list.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import glob +import os +import sys + + +if len(sys.argv) != 2: + print ("Usage: {} ".format(sys.argv[0])) + raise SystemExit(1) + + +for line in glob.glob("{}/*.wav".format(sys.argv[1])): + fname = os.path.basename(line.strip()) + + print ("--noise-id {} --noise-type point-source " + "--bg-fg-type foreground {}".format(fname, line.strip())) diff --git a/egs/chime5/s5b/local/nnet3/compare_wer.sh b/egs/chime5/s5b/local/nnet3/compare_wer.sh new file mode 100644 index 00000000000..fa627acd27b --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/compare_wer.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo + diff --git a/egs/chime5/s5b/local/nnet3/decode.sh b/egs/chime5/s5b/local/nnet3/decode.sh new file mode 100755 index 00000000000..8fa54e0d4a6 --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/decode.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
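For reference, the per-frame weights used for the second-pass i-vector extraction are derived from the first-pass CTM as in local/extract_vad_weights.sh above: frames covered by a confident, non-hesitation word get weight 1.0, and every remaining frame keeps a small silence weight. Below is a minimal Python sketch of that mapping, not code used by the recipe (it assumes 10 ms frames and the usual CTM fields utt/channel/start/duration/word/confidence; the helper name ctm_to_weights is hypothetical):

def ctm_to_weights(ctm_lines, utt_lengths, silence_weight=0.00001, pad_frames=0):
    # Start every frame of every utterance at the silence weight.
    weights = {utt: [silence_weight] * length for utt, length in utt_lengths.items()}
    for line in ctm_lines:  # lines assumed already filtered to confident words
        utt, _chan, start, dur = line.split()[:4]
        beg = int(float(start) * 100) - pad_frames           # 10 ms frames
        end = beg + int(float(dur) * 100) + 2 * pad_frames
        frames = weights[utt]
        for t in range(max(beg, 0), min(end, len(frames))):
            frames[t] = 1.0                                   # voiced frame
    return weights

# Example: a 30 ms word starting at 0.10 s in a 100-frame utterance.
w = ctm_to_weights(["utt1 1 0.10 0.03 hello 1.0"], {"utt1": 100})
assert w["utt1"][10:13] == [1.0, 1.0, 1.0]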
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be compressed using gunzip) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + utils/fix_data_dir.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/chime5/s5b/local/nnet3/run_ivector_common.sh b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..3910e1812a3 --- /dev/null +++ b/egs/chime5/s5b/local/nnet3/run_ivector_common.sh @@ -0,0 +1,151 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." 
+ exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > 
$dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime5/s5b/local/prepare_dict.sh b/egs/chime5/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..09083d0e795 --- /dev/null +++ b/egs/chime5/s5b/local/prepare_dict.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. ./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +dir=data/local/dict + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. +paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print 
$2}' $dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime5/s5b/local/reverberate_lat_dir.sh b/egs/chime5/s5b/local/reverberate_lat_dir.sh new file mode 100755 index 00000000000..f601a37c0e1 --- /dev/null +++ b/egs/chime5/s5b/local/reverberate_lat_dir.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +num_data_reps=1 +cmd=run.pl +nj=20 +include_clean=false + +. utils/parse_options.sh +. ./path.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +train_data_dir=$1 +noisy_latdir=$2 +clean_latdir=$3 +dir=$4 + +clean_nj=$(cat $clean_latdir/num_jobs) + +$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1 + +for n in $(seq $clean_nj); do + cat $dir/lats_clean.$n.scp +done > $dir/lats_clean.scp + +for i in $(seq $num_data_reps); do + cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' +done > $dir/lats_rvb.scp + +noisy_nj=$(cat $noisy_latdir/num_jobs) +$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB>log \ + lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1 + +optional_clean= +if $include_clean; then + optional_clean=$dir/lats_clean.scp +fi + +for n in $(seq $noisy_nj); do + cat $dir/lats_noisy.$n.scp +done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp + +utils/split_data.sh $train_data_dir $nj +$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +if [ -f $clean_latdir/ali.1.gz ]; then + $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp + + for n in $(seq $clean_nj); do + cat $dir/ali_clean.$n.scp + done > $dir/ali_clean.scp + + for i in $(seq $num_data_reps); do + cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' + done > $dir/ali_rvb.scp + + optional_clean= + if $include_clean; then + optional_clean=$dir/ali_clean.scp + fi + + $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp + + for n in $(seq $noisy_nj); do + cat $dir/ali_noisy.$n.scp + done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $train_data_dir $nj || exit 1 + $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl 
$train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1 +fi + +cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true + +rm $dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space diff --git a/egs/chime5/s5b/local/run_beamformit.sh b/egs/chime5/s5b/local/run_beamformit.sh new file mode 100755 index 00000000000..aa3badd90d8 --- /dev/null +++ b/egs/chime5/s5b/local/run_beamformit.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/local/run_recog.sh b/egs/chime5/s5b/local/run_recog.sh new file mode 100755 index 00000000000..989a5f95d01 --- /dev/null +++ b/egs/chime5/s5b/local/run_recog.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This is a subset of run.sh to only perform recognition experiments with evaluation data + +# Begin configuration section. 
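For clarity, the per-array channel list that local/run_beamformit.sh above writes to $expdir/channels_$numch has one line per recording: the base name followed by the selected channel wav files. A minimal Python sketch of that layout (illustrative only; the function name and example recording id are hypothetical, not part of the recipe):

def beamformit_channel_lines(base_names, channels=(1, 2, 3, 4)):
    # One line per recording: "<base> <base>.CH1.wav <base>.CH2.wav ..."
    return [" ".join([base] + ["{}.CH{}.wav".format(base, ch) for ch in channels])
            for base in base_names]

print(beamformit_channel_lines(["S02_U02"])[0])
# S02_U02 S02_U02.CH1.wav S02_U02.CH2.wav S02_U02.CH3.wav S02_U02.CH4.wav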
+decode_nj=20 +stage=0 +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +test_sets="eval_${enhancement}_dereverb_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref + done +fi + +if [ $stage -le 6 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 7 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +nnet3_affix=_${train_set}_cleaned_rvb + +lm_suffix= + +if [ $stage -le 18 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. 
+ affix=1a # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. + chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3${nnet3_affix} \ + --graph-affix ${lm_suffix} \ + data/${data} data/lang${lm_suffix} \ + $tree_dir/graph${lm_suffix} \ + exp/chain${nnet3_affix}/tdnn1b_sp + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +if [ $stage -le 20 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_dereverb_ref_2stage \ + --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_dereverb_ref_2stage +fi diff --git a/egs/chime5/s5b/local/run_wpe.py b/egs/chime5/s5b/local/run_wpe.py new file mode 100755 index 00000000000..2f3818f9c42 --- /dev/null +++ b/egs/chime5/s5b/local/run_wpe.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 +# This script assumes that WPE (nara_wpe) is installed locally using miniconda. +# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh +# needs to be run and this script needs to be launched run with that version of +# python. +# See local/run_wpe.sh for example. 
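The script below splits the --files argument list in half: the first half are input wav paths and the second half the matching output paths. In local/run_wpe.sh each invocation passes a single input/output pair, so dereverberation is applied channel by channel. A minimal sketch of that convention (example paths are illustrative only):

def split_files(files):
    # First half of --files are inputs, second half the corresponding outputs.
    half = len(files) // 2
    return files[:half], files[half:]

ins, outs = split_files(["audio/dev/S02_U02.CH1.wav", "wav/wpe/dev/S02_U02.CH1.wav"])
assert ins == ["audio/dev/S02_U02.CH1.wav"]
assert outs == ["wav/wpe/dev/S02_U02.CH1.wav"]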
+ +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +from nara_wpe.wpe import wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime5/s5b/local/run_wpe.sh b/egs/chime5/s5b/local/run_wpe.sh new file mode 100755 index 00000000000..ed512e69aae --- /dev/null +++ b/egs/chime5/s5b/local/run_wpe.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'." + exit 1 +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $miniconda_dir/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime5/s5b/local/score.sh b/egs/chime5/s5b/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime5/s5b/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime5/s5b/local/score_for_submit.sh b/egs/chime5/s5b/local/score_for_submit.sh new file mode 100755 index 00000000000..23121d68b93 --- /dev/null +++ b/egs/chime5/s5b/local/score_for_submit.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 +# +# This script provides official CHiME-5 challenge submission scores per room and session. +# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=false + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-5 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-5 challenge ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with {dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and 
eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime5/s5b/local/train_lms_srilm.sh b/egs/chime5/s5b/local/train_lms_srilm.sh new file mode 100755 index 00000000000..5a1d56d24b3 --- /dev/null +++ b/egs/chime5/s5b/local/train_lms_srilm.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" 
&& exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm 
$tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text 
$tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perplexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime5/s5b/local/wer_output_filter b/egs/chime5/s5b/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime5/s5b/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while(<>) { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . 
" " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime5/s5b/local/worn_audio_list b/egs/chime5/s5b/local/worn_audio_list new file mode 100644 index 00000000000..fc7a44ad77d --- /dev/null +++ b/egs/chime5/s5b/local/worn_audio_list @@ -0,0 +1,64 @@ +/export/corpora4/CHiME5/audio/train/S03_P09.wav +/export/corpora4/CHiME5/audio/train/S03_P10.wav +/export/corpora4/CHiME5/audio/train/S03_P11.wav +/export/corpora4/CHiME5/audio/train/S03_P12.wav +/export/corpora4/CHiME5/audio/train/S04_P09.wav +/export/corpora4/CHiME5/audio/train/S04_P10.wav +/export/corpora4/CHiME5/audio/train/S04_P11.wav +/export/corpora4/CHiME5/audio/train/S04_P12.wav +/export/corpora4/CHiME5/audio/train/S05_P13.wav +/export/corpora4/CHiME5/audio/train/S05_P14.wav +/export/corpora4/CHiME5/audio/train/S05_P15.wav +/export/corpora4/CHiME5/audio/train/S05_P16.wav +/export/corpora4/CHiME5/audio/train/S06_P13.wav +/export/corpora4/CHiME5/audio/train/S06_P14.wav +/export/corpora4/CHiME5/audio/train/S06_P15.wav +/export/corpora4/CHiME5/audio/train/S06_P16.wav +/export/corpora4/CHiME5/audio/train/S07_P17.wav +/export/corpora4/CHiME5/audio/train/S07_P18.wav +/export/corpora4/CHiME5/audio/train/S07_P19.wav +/export/corpora4/CHiME5/audio/train/S07_P20.wav +/export/corpora4/CHiME5/audio/train/S08_P21.wav +/export/corpora4/CHiME5/audio/train/S08_P22.wav +/export/corpora4/CHiME5/audio/train/S08_P23.wav +/export/corpora4/CHiME5/audio/train/S08_P24.wav +/export/corpora4/CHiME5/audio/train/S12_P33.wav +/export/corpora4/CHiME5/audio/train/S12_P34.wav +/export/corpora4/CHiME5/audio/train/S12_P35.wav +/export/corpora4/CHiME5/audio/train/S12_P36.wav +/export/corpora4/CHiME5/audio/train/S13_P33.wav +/export/corpora4/CHiME5/audio/train/S13_P34.wav +/export/corpora4/CHiME5/audio/train/S13_P35.wav +/export/corpora4/CHiME5/audio/train/S13_P36.wav +/export/corpora4/CHiME5/audio/train/S16_P21.wav +/export/corpora4/CHiME5/audio/train/S16_P22.wav +/export/corpora4/CHiME5/audio/train/S16_P23.wav +/export/corpora4/CHiME5/audio/train/S16_P24.wav +/export/corpora4/CHiME5/audio/train/S17_P17.wav +/export/corpora4/CHiME5/audio/train/S17_P18.wav +/export/corpora4/CHiME5/audio/train/S17_P19.wav +/export/corpora4/CHiME5/audio/train/S17_P20.wav +/export/corpora4/CHiME5/audio/train/S18_P41.wav +/export/corpora4/CHiME5/audio/train/S18_P42.wav +/export/corpora4/CHiME5/audio/train/S18_P43.wav +/export/corpora4/CHiME5/audio/train/S18_P44.wav +/export/corpora4/CHiME5/audio/train/S19_P49.wav +/export/corpora4/CHiME5/audio/train/S19_P50.wav +/export/corpora4/CHiME5/audio/train/S19_P51.wav +/export/corpora4/CHiME5/audio/train/S19_P52.wav +/export/corpora4/CHiME5/audio/train/S20_P49.wav +/export/corpora4/CHiME5/audio/train/S20_P50.wav +/export/corpora4/CHiME5/audio/train/S20_P51.wav +/export/corpora4/CHiME5/audio/train/S20_P52.wav +/export/corpora4/CHiME5/audio/train/S22_P41.wav +/export/corpora4/CHiME5/audio/train/S22_P42.wav +/export/corpora4/CHiME5/audio/train/S22_P43.wav +/export/corpora4/CHiME5/audio/train/S22_P44.wav +/export/corpora4/CHiME5/audio/train/S23_P53.wav +/export/corpora4/CHiME5/audio/train/S23_P54.wav +/export/corpora4/CHiME5/audio/train/S23_P55.wav +/export/corpora4/CHiME5/audio/train/S23_P56.wav +/export/corpora4/CHiME5/audio/train/S24_P53.wav +/export/corpora4/CHiME5/audio/train/S24_P54.wav +/export/corpora4/CHiME5/audio/train/S24_P55.wav +/export/corpora4/CHiME5/audio/train/S24_P56.wav diff --git a/egs/chime5/s5b/path.sh 
b/egs/chime5/s5b/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime5/s5b/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime5/s5b/run.sh b/egs/chime5/s5b/run.sh new file mode 100755 index 00000000000..37bc5c2c94e --- /dev/null +++ b/egs/chime5/s5b/run.sh @@ -0,0 +1,297 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +nnet_stage=-10 +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +test_sets="dev_${enhancement}_dereverb_ref" #"dev_worn dev_addition_dereverb_ref" +#test_sets="dev_${enhancement}_ref" #"dev_worn dev_addition_dereverb_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 1 ]; then + # skip u03 as they are missing + for mictype in worn u01 u02 u04 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "<unk>" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "<unk>" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 120G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in dev eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb_ref + done +fi + +if [ $stage -le 5 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see 
http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + +if [ $stage -le 6 ]; then + local/extract_noises.py $chime5_corpus/audio/train $chime5_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 7 ]; then + # combine mix array and worn mics + # randomly extract first 100k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u04 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 8 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. 
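  # Illustration only (not part of the recipe): the awk command in the speaker-ID
  # fix above appends the array ID, i.e. the third "_"-separated field of the
  # utterance ID, to the speaker ID. A quick sanity check on a single utt2spk line:
  #   echo "P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01" | awk -F "_" '{print $0 "_" $3}'
  #   # -> P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02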
+ for dset in ${train_set} dev_worn; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done + for dset in dev_${enhancement}_dereverb_ref eval_${enhancement}_dereverb_ref; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${train_set} ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + for dset in ${test_sets}; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} & + done + wait +fi + +if [ $stage -le 14 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 15 ]; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + for dset in ${test_sets}; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} & + done + wait +fi + +if [ $stage -le 16 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +if [ $stage -le 17 ]; then + # chain TDNN + local/chain/tuning/run_tdnn_1b.sh --nj ${nj} \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +if [ $stage -le 18 ]; then + # 2-stage decoding + for test_set in $test_sets; do + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3_${train_set}_cleaned_rvb \ + data/${test_set} data/lang_chain \ + exp/chain_${train_set}_cleaned_rvb/tree_sp/graph \ + exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp + done +fi + +if [ $stage -le 19 ]; 
then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_${enhancement}_dereverb_ref \ + --eval exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_${enhancement}_dereverb_ref +fi diff --git a/egs/chime5/s5b/steps b/egs/chime5/s5b/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime5/s5b/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime5/s5b/utils b/egs/chime5/s5b/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime5/s5b/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/chime6/README.txt b/egs/chime6/README.txt new file mode 100644 index 00000000000..9fb48c26822 --- /dev/null +++ b/egs/chime6/README.txt @@ -0,0 +1,6 @@ +This is a kaldi recipe for the 6th CHiME Speech Separation and Recognition Challenge (CHiME-6). + +See http://spandh.dcs.shef.ac.uk/chime_challenge/ for more detailed information. + +s5_track1 : Track 1 of the challenge (oracle segments and speaker label is provided) +s5_track2 : Track 2 of the challenge (only raw audio is provided) diff --git a/egs/chime6/s5_track1/RESULTS b/egs/chime6/s5_track1/RESULTS new file mode 100644 index 00000000000..73b47ddf3cc --- /dev/null +++ b/egs/chime6/s5_track1/RESULTS @@ -0,0 +1,21 @@ + +# tri2 +%WER 88.52 [ 52121 / 58881, 2023 ins, 30285 del, 19813 sub ] exp/tri2/decode_dev_gss/wer_17_0.5 + +# tri3 +%WER 85.72 [ 50471 / 58881, 3079 ins, 23787 del, 23605 sub ] exp/tri3/decode_dev_gss/wer_17_0.5 + +# nnet3 tdnn+chain +%WER 41.21 [ 24267 / 58881, 2428 ins, 7606 del, 14233 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_worn_2stage/wer_11_0.0 +%WER 51.76 [ 30474 / 58881, 2665 ins, 11749 del, 16060 sub ] exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_gss_multiarray_2stage/wer_10_0.0 + +# result with the challenge submission format (Nov 17, 2019) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 4459, wer 53.80 % +session S02 room KITCHEN: #words 12696, #errors 7170, wer 56.47 % +session S02 room LIVING: #words 15460, #errors 7388, wer 47.78 % +session S09 room DINING: #words 5766, #errors 3100, wer 53.76 % +session S09 room KITCHEN: #words 8911, #errors 4483, wer 50.30 % +session S09 room LIVING: #words 7760, #errors 3874, wer 49.92 % +overall: #words 58881, #errors 30474, wer 51.75 % diff --git a/egs/chime6/s5_track1/cmd.sh b/egs/chime6/s5_track1/cmd.sh new file mode 100644 index 00000000000..9702501f1a7 --- /dev/null +++ b/egs/chime6/s5_track1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
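# For example (an illustrative sketch, not part of the original file), running
# everything on the local machine with no queueing system would simply be:
#   export train_cmd="run.pl"
#   export decode_cmd="run.pl"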
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" + diff --git a/egs/chime6/s5_track1/conf/beamformit.cfg b/egs/chime6/s5_track1/conf/beamformit.cfg new file mode 100755 index 00000000000..70fdd858651 --- /dev/null +++ b/egs/chime6/s5_track1/conf/beamformit.cfg @@ -0,0 +1,50 @@ +#BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/) + +# scrolling size to compute the delays +scroll_size = 250 + +# cross correlation computation window size +window_size = 500 + +#amount of maximum points for the xcorrelation taken into account +nbest_amount = 4 + +#flag whether to apply an automatic noise thresholding +do_noise_threshold = 1 + +#Percentage of frames with lower xcorr taken as noisy +noise_percent = 10 + +######## acoustic modelling parameters + +#transition probabilities weight for multichannel decoding +trans_weight_multi = 25 +trans_weight_nbest = 25 + +### + +#flag whether to print the features after setting them, or not +print_features = 1 + +#flag whether to use the bad frames in the sum process +do_avoid_bad_frames = 1 + +#flag to use the best channel (SNR) as a reference +#defined from command line +do_compute_reference = 1 + +#flag whether to use a uem file or not (process all the file) +do_use_uem_file = 0 + +#flag whether to use an adaptive weights scheme or fixed weights +do_adapt_weights = 1 + +#flag whether to output the sph files or just run the system to create the auxiliary files +do_write_sph_files = 1 + +####directories where to store/retrieve info#### +#channels_file = ./cfg-files/channels + +#show needs to be passed as argument normally, here a default one is given just in case +#show_id = Ttmp + diff --git a/egs/chime6/s5_track1/conf/mfcc.conf b/egs/chime6/s5_track1/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/chime6/s5_track1/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/chime6/s5_track1/conf/mfcc_hires.conf b/egs/chime6/s5_track1/conf/mfcc_hires.conf new file mode 100644 index 00000000000..fd64b62eb16 --- /dev/null +++ b/egs/chime6/s5_track1/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. 
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5_track1/conf/online_cmvn.conf b/egs/chime6/s5_track1/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5_track1/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5_track1/conf/queue.conf b/egs/chime6/s5_track1/conf/queue.conf new file mode 100644 index 00000000000..73103195684 --- /dev/null +++ b/egs/chime6/s5_track1/conf/queue.conf @@ -0,0 +1,10 @@ +command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* +option mem=* -l mem_free=$0,ram_free=$0 +option mem=0 # Do not add anything to qsub_opts +option num_threads=* -pe smp $0 +option num_threads=1 # Do not add anything to qsub_opts +option max_jobs_run=* -tc $0 +default gpu=0 +option gpu=0 -q all.q -l hostname='!b19*' +option gpu=* -l gpu=$0 -q g.q -l hostname='!b19*' + diff --git a/egs/chime6/s5_track1/local/add_location_to_uttid.sh b/egs/chime6/s5_track1/local/add_location_to_uttid.sh new file mode 100755 index 00000000000..91bd0c0dd37 --- /dev/null +++ b/egs/chime6/s5_track1/local/add_location_to_uttid.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Author: Ashish Arora +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +enhancement=gss +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/add_location_to_uttid.sh [options] " + echo " " + echo "main options (for others, see top of script file)" + echo " --enhancement # enhancement type (gss or beamformit)" + exit 1; +fi + +jdir=$1 +puttdir=$2 +utt_loc_file=$3 + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +if [[ ${enhancement} == *gss* ]]; then + local/get_location.py $jdir > $utt_loc_file + local/replace_uttid.py $utt_loc_file $puttdir/per_utt > $puttdir/per_utt_loc +fi + +if [[ ${enhancement} == *beamformit* ]]; then + cat $puttdir/per_utt > $puttdir/per_utt_loc +fi diff --git a/egs/chime5/s5/local/chain/compare_wer.sh b/egs/chime6/s5_track1/local/chain/compare_wer.sh similarity index 100% rename from egs/chime5/s5/local/chain/compare_wer.sh rename to egs/chime6/s5_track1/local/chain/compare_wer.sh diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/chime6/s5_track1/local/chain/run_tdnn.sh similarity index 100% rename from egs/tedlium/s5_r3/local/chain/run_tdnnf.sh rename to egs/chime6/s5_track1/local/chain/run_tdnn.sh diff --git a/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh b/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..daad37e2cd7 --- /dev/null +++ b/egs/chime6/s5_track1/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,270 @@ +#!/bin/bash + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=96 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u100k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
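# A usage sketch (not from the original script): thanks to the utils/parse_options.sh
# call further below, every variable in this configuration section can be overridden
# on the command line, with '_' in the variable name written as '-', e.g.:
#   local/chain/tuning/run_tdnn_1a.sh --stage 14 --train-set train_worn_u100k \
#     --gmm tri3 --nnet3-affix _train_worn_u100k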
+affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.01 bottleneck-dim=320" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=512 + relu-batchnorm-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=512 + relu-batchnorm-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=512 + relu-batchnorm-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 target-rms=0.5 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 target-rms=0.5 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +# Not testing the 'looped' decoding separately, because for +# TDNN systems it would give exactly the same results as the +# normal decoding. + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
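  # For instance (a sketch only; the pitch-related option names are assumed from
  # other Kaldi online recipes, not taken from this patch), adding pitch would
  # look roughly like:
  #   steps/online/nnet3/prepare_online_decoding.sh \
  #     --mfcc-config conf/mfcc_hires.conf \
  #     --add-pitch true --online-pitch-config conf/online_pitch.conf \
  #     $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online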
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2776 combine=-0.134->-0.133 (over 3) xent:train/valid[285,428,final]=(-2.37,-1.95,-1.95/-2.19,-1.90,-1.91) logprob:train/valid[285,428,final]=(-0.201,-0.125,-0.124/-0.198,-0.147,-0.148) + +set -e + +# configs for 'chain' +stage=0 +nj=96 +train_set=train_worn_u400k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nnet3_affix=_train_worn_u400k +lm_suffix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_epochs=4 +common_egs_dir= +# training options +# training chunk-options +chunk_width=140,100,160 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. +skip_decoding=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj ${nj} --cmd "$train_cmd" --generate-ali-from-lats true \ + ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $lat_dir $tree_dir +fi + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/chime5-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule "$dropout_schedule" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; +fi + +if [ $stage -le 16 ] && [[ $skip_decoding == "false" ]]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/chime6/s5_track1/local/check_tools.sh b/egs/chime6/s5_track1/local/check_tools.sh new file mode 100755 index 00000000000..8e80e25ca33 --- /dev/null +++ b/egs/chime6/s5_track1/local/check_tools.sh @@ -0,0 +1,76 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh + +command -v uconv &>/dev/null \ + || { echo >&2 "uconv not found on PATH. You will have to install ICU4C"; exit 1; } + +command -v ngram &>/dev/null \ + || { echo >&2 "srilm not found on PATH. 
Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh to install it"; exit 1; } + +if [ -z ${LIBLBFGS} ]; then + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + +sox=`command -v sox 2>/dev/null` \ + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } + +# If sox is found on path, check if the version is correct +if [ ! -z "$sox" ]; then + sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` + if [[ ! $sox_version =~ v14.4.* ]]; then + echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." + exit 1 + fi +fi + +command -v phonetisaurus-align &>/dev/null \ + || { echo >&2 "Phonetisaurus not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_phonetisaurus.sh to install it"; exit 1; } + +command -v BeamformIt &>/dev/null \ + || { echo >&2 "BeamformIt not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_beamformit.sh to install it"; exit 1; } + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '../../../tools/extras/install_miniconda.sh'" +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" != "1" ]; then + echo "WPE is not installed. Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +# this is used for the audio synchronization +sox_conda=`command -v ${miniconda_dir}/bin/sox 2>/dev/null` +if [ -z "${sox_conda}" ]; then + echo "install conda sox (v14.4.2)" + ${miniconda_dir}/bin/conda install -c conda-forge sox +fi + +exit 0 diff --git a/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh b/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh new file mode 100755 index 00000000000..82839604c9e --- /dev/null +++ b/egs/chime6/s5_track1/local/copy_lat_dir_parallel.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +cmd=queue.pl +nj=40 +stage=0 +speed_perturb=true + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 <utt-map> <data-dir> <src-lat-dir> <dest-lat-dir>" + exit 1 +fi + +utt_map=$1 +data=$2 +srcdir=$3 +dir=$4 + +mkdir -p $dir + +cp $srcdir/{phones.txt,tree,final.mdl} $dir || exit 1 +cp $srcdir/{final.alimdl,final.occs,splice_opts,cmvn_opts,delta_opts,final.mat,full.mat} $dir 2>/dev/null || true + +nj_src=$(cat $srcdir/num_jobs) || exit 1 + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_lats_orig.JOB.log \ + lattice-copy "ark:gunzip -c $srcdir/lat.JOB.gz |" \ + ark,scp:$dir/lat_orig.JOB.ark,$dir/lat_orig.JOB.scp || exit 1 +fi + +for n in $(seq $nj_src); do + cat $dir/lat_orig.$n.scp +done > $dir/lat_orig.scp || exit 1 + +if $speed_perturb; then + for s in 0.9 1.1; do + awk -v s=$s '{print "sp"s"-"$1" sp"s"-"$2}' $utt_map + done | cat - $utt_map | sort -k1,1 > $dir/utt_map + utt_map=$dir/utt_map +fi + +if [ $stage -le 2 ]; then + utils/filter_scp.pl -f 2 $dir/lat_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/lat_orig.scp > \ + $dir/lat.scp || exit 1 + + if [ ! -s $dir/lat.scp ]; then + echo "$0: $dir/lat.scp is empty. Something went wrong!" 
+ exit 1 + fi +fi + +utils/split_data.sh $data $nj + +if [ $stage -le 3 ]; then + $cmd JOB=1:$nj $dir/log/copy_lats.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/lat.scp |" \ + "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1 +fi + +echo $nj > $dir/num_jobs + +if [ -f $srcdir/ali.1.gz ]; then + if [ $stage -le 4 ]; then + $cmd JOB=1:$nj_src $dir/log/copy_ali_orig.JOB.log \ + copy-int-vector "ark:gunzip -c $srcdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_orig.JOB.ark,$dir/ali_orig.JOB.scp || exit 1 + fi + + for n in $(seq $nj_src); do + cat $dir/ali_orig.$n.scp + done > $dir/ali_orig.scp || exit 1 + + if [ $stage -le 5 ]; then + utils/filter_scp.pl -f 2 $dir/ali_orig.scp < $utt_map | \ + utils/apply_map.pl -f 2 $dir/ali_orig.scp > \ + $dir/ali.scp || exit 1 + + if [ ! -s $dir/ali.scp ]; then + echo "$0: $dir/ali.scp is empty. Something went wrong!" + exit 1 + fi + fi + + utils/split_data.sh $data $nj + + if [ $stage -le 6 ]; then + $cmd JOB=1:$nj $dir/log/copy_ali.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $data/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c > $dir/ali.JOB.gz" || exit 1 + fi +fi + +rm $dir/lat_orig.*.{ark,scp} $dir/ali_orig.*.{ark,scp} 2>/dev/null || true diff --git a/egs/chime6/s5_track1/local/decode.sh b/egs/chime6/s5_track1/local/decode.sh new file mode 100755 index 00000000000..7283a171000 --- /dev/null +++ b/egs/chime6/s5_track1/local/decode.sh @@ -0,0 +1,253 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This script only performs recognition experiments with evaluation data +# This script can be run from run.sh or standalone.  +# To run it standalone, you can download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the {data/ exp/} directory to your {data/ exp/} + +# Begin configuration section. +decode_nj=20 +gss_nj=50 +stage=0 +enhancement=gss # for a new enhancement method, + # change this variable and stage 4 + +# training data +train_set=train_worn_simu_u400k +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +enhanced_dir=enhanced +if [[ ${enhancement} == *gss* ]]; then + enhanced_dir=${enhanced_dir}_multiarray + enhancement=${enhancement}_multiarray +fi + +if [[ ${enhancement} == *beamformit* ]]; then + enhanced_dir=${enhanced_dir} + enhancement=${enhancement} +fi + +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 +test_sets="dev_${enhancement} eval_${enhancement}" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. 
Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +######################################################################################### +# In stage 1, we perform GSS based enhancement or beamformit for the test sets. multiarray = true +#can take around 10hrs for dev and eval set. +######################################################################################### + +if [ $stage -le 1 ] && [[ ${enhancement} == *gss* ]]; then + echo "$0: enhance data..." + # Guided Source Separation (GSS) from Paderborn University + # http://spandh.dcs.shef.ac.uk/chime_workshop/papers/CHiME_2018_paper_boeddecker.pdf + # @Article{PB2018CHiME5, + # author = {Boeddeker, Christoph and Heitkaemper, Jens and Schmalenstroeer, Joerg and Drude, Lukas and Heymann, Jahn and Haeb-Umbach, Reinhold}, + # title = {{Front-End Processing for the CHiME-5 Dinner Party Scenario}}, + # year = {2018}, + # booktitle = {CHiME5 Workshop}, + # } + + if [ ! -d pb_chime5/ ]; then + local/install_pb_chime5.sh + fi + + if [ ! -f pb_chime5/cache/chime6.json ]; then + ( + cd pb_chime5 + miniconda_dir=$HOME/miniconda3/ + export PATH=$miniconda_dir/bin:$PATH + export CHIME6_DIR=$chime6_corpus + make cache/chime6.json + ) + fi + + for dset in dev eval; do + local/run_gss.sh \ + --cmd "$train_cmd --max-jobs-run $gss_nj" --nj 160 \ + ${dset} \ + ${enhanced_dir} \ + ${enhanced_dir} || exit 1 + done + + for dset in dev eval; do + local/prepare_data.sh --mictype gss ${enhanced_dir}/audio/${dset} \ + ${json_dir}/${dset} data/${dset}_${enhancement} || exit 1 + done +fi + +####################################################################### +# Prepare the dev and eval data with dereverberation (WPE) and +# beamforming. +####################################################################### + +if [ $stage -le 1 ] && [[ ${enhancement} == *beamformit* ]]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhanced_dir=enhan + dereverb_dir=${PWD}/wav/wpe/ + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhanced_dir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in dev eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhanced_dir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement} + done +fi + +# In GSS enhancement, we do not have array information in utterance ID +if [ $stage -le 2 ] && [[ ${enhancement} == *gss* ]]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_orig + done + + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_orig data/${dset} + done +fi + +if [ $stage -le 2 ] && [[ ${enhancement} == *beamformit* ]]; then + # fix speaker ID issue (thanks to Dr. 
Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + echo "$0: fix data..." + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + for f in segments text wav.scp; do + if [ -f data/${dset}_nosplit/$f ]; then + cp data/${dset}_nosplit/$f data/${dset}_nosplit_fix + fi + done + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +########################################################################## +# DECODING: we perform 2 stage decoding. +########################################################################## + +nnet3_affix=_${train_set}_cleaned_rvb +lm_suffix= + +if [ $stage -le 3 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. + echo "$0: decode data..." + affix=1b # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. 
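  # (Illustration only, not part of the patch) the frames_per_chunk value computed
  # below is simply the first entry of chunk_width:
  #   echo "140,100,160" | cut -d, -f1   # -> 140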
+ chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + local/nnet3/decode.sh --affix 2stage --pass2-decode-opts "--min-active 1000" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $decode_nj \ + --ivector-dir exp/nnet3${nnet3_affix} \ + data/${data} data/lang${lm_suffix} \ + $tree_dir/graph${lm_suffix} \ + exp/chain${nnet3_affix}/tdnn${affix}_sp + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +########################################################################## +# Scoring: here we obtain wer per session per location and overall WER +########################################################################## + +if [ $stage -le 4 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh --enhancement $enhancement --json $json_dir \ + --dev exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_dev_${enhancement}_2stage \ + --eval exp/chain${nnet3_affix}/tdnn1b_sp/decode${lm_suffix}_eval_${enhancement}_2stage +fi diff --git a/egs/chime6/s5_track1/local/distant_audio_list b/egs/chime6/s5_track1/local/distant_audio_list new file mode 100644 index 00000000000..710945b014b --- /dev/null +++ b/egs/chime6/s5_track1/local/distant_audio_list @@ -0,0 +1,372 @@ +S03_U01.CH1 +S03_U01.CH2 +S03_U01.CH3 +S03_U01.CH4 +S03_U02.CH1 +S03_U02.CH2 +S03_U02.CH3 +S03_U02.CH4 +S03_U03.CH1 +S03_U03.CH2 +S03_U03.CH3 +S03_U03.CH4 +S03_U04.CH1 +S03_U04.CH2 +S03_U04.CH3 +S03_U04.CH4 +S03_U05.CH1 +S03_U05.CH2 +S03_U05.CH3 +S03_U05.CH4 +S03_U06.CH1 +S03_U06.CH2 +S03_U06.CH3 +S03_U06.CH4 +S04_U01.CH1 +S04_U01.CH2 +S04_U01.CH3 +S04_U01.CH4 +S04_U02.CH1 +S04_U02.CH2 +S04_U02.CH3 +S04_U02.CH4 +S04_U03.CH1 +S04_U03.CH2 +S04_U03.CH3 +S04_U03.CH4 +S04_U04.CH1 +S04_U04.CH2 +S04_U04.CH3 +S04_U04.CH4 +S04_U05.CH1 +S04_U05.CH2 +S04_U05.CH3 +S04_U05.CH4 +S04_U06.CH1 +S04_U06.CH2 +S04_U06.CH3 +S04_U06.CH4 +S05_U01.CH1 +S05_U01.CH2 +S05_U01.CH3 +S05_U01.CH4 +S05_U02.CH1 +S05_U02.CH2 +S05_U02.CH3 +S05_U02.CH4 +S05_U05.CH1 +S05_U05.CH2 +S05_U05.CH3 +S05_U05.CH4 +S05_U06.CH1 +S05_U06.CH2 +S05_U06.CH3 +S05_U06.CH4 +S06_U01.CH1 +S06_U01.CH2 +S06_U01.CH3 +S06_U01.CH4 +S06_U02.CH1 +S06_U02.CH2 +S06_U02.CH3 +S06_U02.CH4 +S06_U03.CH1 +S06_U03.CH2 +S06_U03.CH3 +S06_U03.CH4 +S06_U04.CH1 +S06_U04.CH2 +S06_U04.CH3 +S06_U04.CH4 +S06_U05.CH1 +S06_U05.CH2 +S06_U05.CH3 +S06_U05.CH4 +S06_U06.CH1 +S06_U06.CH2 +S06_U06.CH3 +S06_U06.CH4 +S07_U01.CH1 +S07_U01.CH2 +S07_U01.CH3 +S07_U01.CH4 +S07_U02.CH1 +S07_U02.CH2 +S07_U02.CH3 +S07_U02.CH4 +S07_U03.CH1 +S07_U03.CH2 +S07_U03.CH3 +S07_U03.CH4 +S07_U04.CH1 +S07_U04.CH2 +S07_U04.CH3 +S07_U04.CH4 +S07_U05.CH1 +S07_U05.CH2 +S07_U05.CH3 +S07_U05.CH4 +S07_U06.CH1 +S07_U06.CH2 +S07_U06.CH3 +S07_U06.CH4 +S08_U01.CH1 +S08_U01.CH2 +S08_U01.CH3 +S08_U01.CH4 +S08_U02.CH1 +S08_U02.CH2 +S08_U02.CH3 +S08_U02.CH4 +S08_U03.CH1 +S08_U03.CH2 +S08_U03.CH3 +S08_U03.CH4 +S08_U04.CH1 +S08_U04.CH2 +S08_U04.CH3 +S08_U04.CH4 +S08_U05.CH1 +S08_U05.CH2 +S08_U05.CH3 +S08_U05.CH4 +S08_U06.CH1 +S08_U06.CH2 +S08_U06.CH3 +S08_U06.CH4 +S12_U01.CH1 +S12_U01.CH2 +S12_U01.CH3 +S12_U01.CH4 +S12_U02.CH1 
+S12_U02.CH2 +S12_U02.CH3 +S12_U02.CH4 +S12_U03.CH1 +S12_U03.CH2 +S12_U03.CH3 +S12_U03.CH4 +S12_U04.CH1 +S12_U04.CH2 +S12_U04.CH3 +S12_U04.CH4 +S12_U05.CH1 +S12_U05.CH2 +S12_U05.CH3 +S12_U05.CH4 +S12_U06.CH1 +S12_U06.CH2 +S12_U06.CH3 +S12_U06.CH4 +S13_U01.CH1 +S13_U01.CH2 +S13_U01.CH3 +S13_U01.CH4 +S13_U02.CH1 +S13_U02.CH2 +S13_U02.CH3 +S13_U02.CH4 +S13_U03.CH1 +S13_U03.CH2 +S13_U03.CH3 +S13_U03.CH4 +S13_U04.CH1 +S13_U04.CH2 +S13_U04.CH3 +S13_U04.CH4 +S13_U05.CH1 +S13_U05.CH2 +S13_U05.CH3 +S13_U05.CH4 +S13_U06.CH1 +S13_U06.CH2 +S13_U06.CH3 +S13_U06.CH4 +S16_U01.CH1 +S16_U01.CH2 +S16_U01.CH3 +S16_U01.CH4 +S16_U02.CH1 +S16_U02.CH2 +S16_U02.CH3 +S16_U02.CH4 +S16_U03.CH1 +S16_U03.CH2 +S16_U03.CH3 +S16_U03.CH4 +S16_U04.CH1 +S16_U04.CH2 +S16_U04.CH3 +S16_U04.CH4 +S16_U05.CH1 +S16_U05.CH2 +S16_U05.CH3 +S16_U05.CH4 +S16_U06.CH1 +S16_U06.CH2 +S16_U06.CH3 +S16_U06.CH4 +S17_U01.CH1 +S17_U01.CH2 +S17_U01.CH3 +S17_U01.CH4 +S17_U02.CH1 +S17_U02.CH2 +S17_U02.CH3 +S17_U02.CH4 +S17_U03.CH1 +S17_U03.CH2 +S17_U03.CH3 +S17_U03.CH4 +S17_U04.CH1 +S17_U04.CH2 +S17_U04.CH3 +S17_U04.CH4 +S17_U05.CH1 +S17_U05.CH2 +S17_U05.CH3 +S17_U05.CH4 +S17_U06.CH1 +S17_U06.CH2 +S17_U06.CH3 +S17_U06.CH4 +S18_U01.CH1 +S18_U01.CH2 +S18_U01.CH3 +S18_U01.CH4 +S18_U02.CH1 +S18_U02.CH2 +S18_U02.CH3 +S18_U02.CH4 +S18_U03.CH1 +S18_U03.CH2 +S18_U03.CH3 +S18_U03.CH4 +S18_U04.CH1 +S18_U04.CH2 +S18_U04.CH3 +S18_U04.CH4 +S18_U05.CH1 +S18_U05.CH2 +S18_U05.CH3 +S18_U05.CH4 +S18_U06.CH1 +S18_U06.CH2 +S18_U06.CH3 +S18_U06.CH4 +S19_U01.CH1 +S19_U01.CH2 +S19_U01.CH3 +S19_U01.CH4 +S19_U02.CH1 +S19_U02.CH2 +S19_U02.CH3 +S19_U02.CH4 +S19_U03.CH1 +S19_U03.CH2 +S19_U03.CH3 +S19_U03.CH4 +S19_U04.CH1 +S19_U04.CH2 +S19_U04.CH3 +S19_U04.CH4 +S19_U05.CH1 +S19_U05.CH2 +S19_U05.CH3 +S19_U05.CH4 +S19_U06.CH1 +S19_U06.CH2 +S19_U06.CH3 +S19_U06.CH4 +S20_U01.CH1 +S20_U01.CH2 +S20_U01.CH3 +S20_U01.CH4 +S20_U02.CH1 +S20_U02.CH2 +S20_U02.CH3 +S20_U02.CH4 +S20_U03.CH1 +S20_U03.CH2 +S20_U03.CH3 +S20_U03.CH4 +S20_U04.CH1 +S20_U04.CH2 +S20_U04.CH3 +S20_U04.CH4 +S20_U05.CH1 +S20_U05.CH2 +S20_U05.CH3 +S20_U05.CH4 +S20_U06.CH1 +S20_U06.CH2 +S20_U06.CH3 +S20_U06.CH4 +S22_U01.CH1 +S22_U01.CH2 +S22_U01.CH3 +S22_U01.CH4 +S22_U02.CH1 +S22_U02.CH2 +S22_U02.CH3 +S22_U02.CH4 +S22_U04.CH1 +S22_U04.CH2 +S22_U04.CH3 +S22_U04.CH4 +S22_U05.CH1 +S22_U05.CH2 +S22_U05.CH3 +S22_U05.CH4 +S22_U06.CH1 +S22_U06.CH2 +S22_U06.CH3 +S22_U06.CH4 +S23_U01.CH1 +S23_U01.CH2 +S23_U01.CH3 +S23_U01.CH4 +S23_U02.CH1 +S23_U02.CH2 +S23_U02.CH3 +S23_U02.CH4 +S23_U03.CH1 +S23_U03.CH2 +S23_U03.CH3 +S23_U03.CH4 +S23_U04.CH1 +S23_U04.CH2 +S23_U04.CH3 +S23_U04.CH4 +S23_U05.CH1 +S23_U05.CH2 +S23_U05.CH3 +S23_U05.CH4 +S23_U06.CH1 +S23_U06.CH2 +S23_U06.CH3 +S23_U06.CH4 +S24_U01.CH1 +S24_U01.CH2 +S24_U01.CH3 +S24_U01.CH4 +S24_U02.CH1 +S24_U02.CH2 +S24_U02.CH3 +S24_U02.CH4 +S24_U03.CH1 +S24_U03.CH2 +S24_U03.CH3 +S24_U03.CH4 +S24_U04.CH1 +S24_U04.CH2 +S24_U04.CH3 +S24_U04.CH4 +S24_U05.CH1 +S24_U05.CH2 +S24_U05.CH3 +S24_U05.CH4 +S24_U06.CH1 +S24_U06.CH2 +S24_U06.CH3 +S24_U06.CH4 diff --git a/egs/chime6/s5_track1/local/extract_noises.py b/egs/chime6/s5_track1/local/extract_noises.py new file mode 100755 index 00000000000..8f617752f2d --- /dev/null +++ b/egs/chime6/s5_track1/local/extract_noises.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import json +import logging +import os +import sys +import scipy.io.wavfile as siw +import math +import numpy as np + + +def get_args(): + parser = argparse.ArgumentParser( + """Extract noises from the corpus based on the non-speech regions. + e.g. 
{} /export/corpora4/CHiME5/audio/train/ \\ + /export/corpora4/CHiME5/transcriptions/train/ \\ + /export/b05/zhiqiw/noise/""".format(sys.argv[0])) + + parser.add_argument("--segment-length", default=20) + parser.add_argument("audio_dir", help="""Location of the CHiME5 Audio files. e.g. /export/corpora4/CHiME5/audio/train/""") + parser.add_argument("trans_dir", help="""Location of the CHiME5 Transcriptions. e.g. /export/corpora4/CHiME5/transcriptions/train/""") + parser.add_argument("audio_list", help="""List of ids of the CHiME5 recordings from which noise is extracted. e.g. local/distant_audio_list""") + parser.add_argument("out_dir", help="Output directory to write noise files. e.g. /export/b05/zhiqiw/noise/") + + args = parser.parse_args() + return args + + +def Trans_time(time, fs): + units = time.split(':') + time_second = float(units[0]) * 3600 + float(units[1]) * 60 + float(units[2]) + return int(time_second*fs) + + +# remove mic dependency for CHiME-6 +def Get_time(conf, tag, fs): + for i in conf: + st = Trans_time(i['start_time'], fs) + ed = Trans_time(i['end_time'], fs) + tag[st:ed] = 0 + return tag + + +def write_noise(out_dir, seg, audio, sig, tag, fs, cnt): + sig_noise = sig[np.nonzero(tag)] + for i in range(math.floor(len(sig_noise)/(seg*fs))): + siw.write(out_dir +'/noise'+str(cnt)+'.wav', fs, sig_noise[i*seg*fs:(i+1)*seg*fs]) + cnt += 1 + return cnt + + +def main(): + args = get_args() + + if not os.path.exists(args.out_dir): + os.makedirs(args.out_dir) + + wav_list = open(args.audio_list).readlines() + + cnt = 1 + for i, audio in enumerate(wav_list): + parts = audio.strip().split('.') + if len(parts) == 2: + # Assuming distant mic with name like S03_U01.CH1 + session, mic = parts[0].split('_') + channel = parts[1] + base_name = session + "_" + mic + "." + channel + else: + # Assuming close talk mic with name like S03_P09 + session, mic = audio.strip().split('_') + base_name = session + "_" + mic + fs, sig = siw.read(args.audio_dir + "/" + base_name + '.wav') + tag = np.ones(len(sig)) + if i == 0 or session != session_p: + with open(args.trans_dir + "/" + session + '.json') as f: + conf = json.load(f) + tag = Get_time(conf, tag, fs) + cnt = write_noise(args.out_dir, args.segment_length, audio, sig, tag, fs, cnt) + session_p = session + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track1/local/extract_vad_weights.sh b/egs/chime6/s5_track1/local/extract_vad_weights.sh new file mode 100755 index 00000000000..250b021bd8f --- /dev/null +++ b/egs/chime6/s5_track1/local/extract_vad_weights.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script converts lattices available from a first pass decode into a per-frame weights file +# The ctms generated from the lattices are filtered. Silence frames are assigned a low weight (e.g.0.00001) +# and voiced frames have a weight of 1. + +set -e + +stage=1 +cmd=run.pl +silence_weight=0.00001 +#end configuration section. + +. ./cmd.sh + +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; +if [ $# -ne 4 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + exit 1; +fi + +data_dir=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. 
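+# The remaining arguments are the first-pass decode directory and the gzipped
+# output archive.  The output is a Kaldi text-format vector archive with one
+# weight per frame, e.g. an illustrative entry (utterance id made up):
+#   P05_S02.ENH-0000192-0001278 [ 0.00001 0.00001 1 1 1 0.00001 ]
+# where voiced frames get weight 1 and everything else gets $silence_weight.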
+decode_dir=$3 +output_wts_file_gz=$4 + +if [ $stage -le 1 ]; then + echo "$0: generating CTM from input lattices" + steps/get_ctm_conf.sh --cmd "$cmd" \ + --use-segments false \ + $data_dir \ + $lang \ + $decode_dir +fi + +if [ $stage -le 2 ]; then + name=`basename $data_dir` + # we just take the ctm from LMWT 10, it doesn't seem to affect the results a lot + ctm=$decode_dir/score_10/$name.ctm + echo "$0: generating weights file from ctm $ctm" + + pad_frames=0 # this did not seem to be helpful but leaving it as an option. + feat-to-len scp:$data_dir/feats.scp ark,t:- >$decode_dir/utt.lengths + if [ ! -f $ctm ]; then echo "$0: expected ctm to exist: $ctm"; exit 1; fi + + cat $ctm | awk '$6 == 1.0 && $4 < 1.0' | \ + grep -v -w mm | grep -v -w mhm | grep -v -F '[noise]' | \ + grep -v -F '[laughter]' | grep -v -F '' | \ + perl -e ' $lengths=shift @ARGV; $pad_frames=shift @ARGV; $silence_weight=shift @ARGV; + $pad_frames >= 0 || die "bad pad-frames value $pad_frames"; + open(L, "<$lengths") || die "opening lengths file"; + @all_utts = (); + $utt2ref = { }; + while () { + ($utt, $len) = split(" ", $_); + push @all_utts, $utt; + $array_ref = [ ]; + for ($n = 0; $n < $len; $n++) { ${$array_ref}[$n] = $silence_weight; } + $utt2ref{$utt} = $array_ref; + } + while () { + @A = split(" ", $_); + @A == 6 || die "bad ctm line $_"; + $utt = $A[0]; $beg = $A[2]; $len = $A[3]; + $beg_int = int($beg * 100) - $pad_frames; + $len_int = int($len * 100) + 2*$pad_frames; + $array_ref = $utt2ref{$utt}; + !defined $array_ref && die "No length info for utterance $utt"; + for ($t = $beg_int; $t < $beg_int + $len_int; $t++) { + if ($t >= 0 && $t < @$array_ref) { + ${$array_ref}[$t] = 1; + } + } + } + foreach $utt (@all_utts) { $array_ref = $utt2ref{$utt}; + print $utt, " [ ", join(" ", @$array_ref), " ]\n"; + } ' $decode_dir/utt.lengths $pad_frames $silence_weight | \ + gzip -c > $output_wts_file_gz +fi diff --git a/egs/chime6/s5_track1/local/generate_chime6_data.sh b/egs/chime6/s5_track1/local/generate_chime6_data.sh new file mode 100755 index 00000000000..93106cf605a --- /dev/null +++ b/egs/chime6/s5_track1/local/generate_chime6_data.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +# Copyright 2019, Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script generates synchronized audio data across arrays by considering +# the frame dropping, clock drift etc. done by Prof. Jon Barker at University of +# Sheffield. This script first downloads the synchronization tool and generate +# the synchronized audios and corresponding JSON transcription files +# Note that +# 1) the JSON format is slightly changed from the original CHiME-5 one (simplified +# thanks to the synchronization) +# 2) it requires sox v.14.4.2 and Python 3.6.7 +# Unfortunately, the generated files would be different depending on the sox +# and Python versions and to generate the exactly same audio files, this script uses +# the fixed versions of sox and Python installed in the miniconda instead of system ones + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 2 ]; then + echo "Wrong #arguments ($#, expected 2)" + echo "Usage: local/generate_chime6_data.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + exit 1; +fi + +sdir=$1 +odir=$2 +expdir=${PWD}/exp/chime6_data + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +# get chime6-synchronisation tools +SYNC_PATH=${PWD}/chime6-synchronisation +if [ ! -d ${SYNC_PATH} ]; then + git clone https://github.com/chimechallenge/chime6-synchronisation.git +fi + +mkdir -p ${odir} +mkdir -p ${expdir}/log + +# split the session to avoid too much disk access +sessions1="S01 S02 S03 S04 S05 S06 S07" +sessions2="S08 S09 S12 S13 S16 S17 S18" +sessions3="S19 S20 S21 S22 S23 S24" + +CONDA_PATH=${HOME}/miniconda3/bin +IN_PATH=${sdir}/audio +OUT_PATH=${odir}/audio +TMP_PATH=${odir}/audio_tmp + +if [ ! -d "${IN_PATH}" ]; then + echo "please specify the CHiME-5 data path correctly" + exit 1 +fi +mkdir -p $OUT_PATH/train $OUT_PATH/eval $OUT_PATH/dev +mkdir -p $TMP_PATH/train $TMP_PATH/eval $TMP_PATH/dev + +if [ -f ${odir}/audio/dev/S02_P05.wav ]; then + echo "CHiME-6 date already exists" + exit 0 +fi + +pushd ${SYNC_PATH} +echo "Correct for frame dropping" +for session in ${sessions1}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait +for session in ${sessions2}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait +for session in ${sessions3}; do + $cmd ${expdir}/correct_signals_for_frame_drops.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_frame_drops.py --session=${session} chime6_audio_edits.json $IN_PATH $TMP_PATH & +done +wait + +echo "Sox processing for correcting clock drift" +for session in ${sessions1}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait +for session in ${sessions2}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait +for session in ${sessions3}; do + $cmd ${expdir}/correct_signals_for_clock_drift.${session}.log \ + ${CONDA_PATH}/python correct_signals_for_clock_drift.py --session=${session} --sox_path $CONDA_PATH chime6_audio_edits.json $TMP_PATH $OUT_PATH & +done +wait + +echo "adjust the JSON files" +mkdir -p ${odir}/transcriptions/eval ${odir}/transcriptions/dev ${odir}/transcriptions/train +${CONDA_PATH}/python correct_transcript_for_clock_drift.py --clock_drift_data chime6_audio_edits.json ${sdir}/transcriptions ${odir}/transcriptions +popd + +# finally check md5sum +pushd ${odir} +echo "check MD5 hash value for generated audios" +md5sum -c ${SYNC_PATH}/audio_md5sums.txt || echo "check https://github.com/chimechallenge/chime6-synchronisation" +popd + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/get_location.py b/egs/chime6/s5_track1/local/get_location.py new file mode 100755 index 00000000000..92351e72e65 --- /dev/null +++ b/egs/chime6/s5_track1/local/get_location.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# Copyright Ashish Arora +# Apache 2.0 +# This script create a utterance and location mapping file +# It is used in score_for_submit script to get locationwise WER. 
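+# The mapping has one "<uttid> <LOCATION>" line per utterance, for example
+# (utterance id illustrative):
+#   P05_S02-0004060-0004382 KITCHEN
+# The utterance ids follow the key format used by the decode directories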
+# for GSS enhancement + +import json +from datetime import timedelta +from glob import glob +import sys, io +from decimal import Decimal + +SAMPLE_RATE = 16000 + +def to_samples(time: str): + "mapping time in string to int, as mapped in pb_chime5" + "see https://github.com/fgnt/pb_chime5/blob/master/pb_chime5/database/chime5/get_speaker_activity.py" + hours, minutes, seconds = [t for t in time.split(':')] + hours = int(hours) + minutes = int(minutes) + seconds = Decimal(seconds) + + seconds_samples = seconds * SAMPLE_RATE + samples = ( + hours * 3600 * SAMPLE_RATE + + minutes * 60 * SAMPLE_RATE + + seconds_samples + ) + return int(samples) + + +def main(): + output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + location_dict = {} + json_file_location= sys.argv[1] + '/*.json' + json_files = glob(json_file_location) + location_dict = {} + for file in json_files: + with open(file, 'r') as f: + session_dict = json.load(f) + + for uttid in session_dict: + try: + ref=uttid['ref'] + speaker_id = uttid['speaker'] + location = uttid['location'] + location=location.upper() + session_id=uttid['session_id'] + words = uttid['words'] + end_sample=to_samples(str(uttid['end_time'])) + start_sample=to_samples(str(uttid['start_time'])) + start_sample_str = str(int(start_sample * 100 / SAMPLE_RATE)).zfill(7) + end_sample_str = str(int(end_sample * 100 / SAMPLE_RATE)).zfill(7) + utt = "{0}_{1}-{2}-{3}".format(speaker_id, session_id, start_sample_str, end_sample_str) + location_dict[utt]=(location) + except: + continue + + for key in sorted(location_dict.keys()): + utt= "{0} {1}".format(key, location_dict[key]) + output.write(utt+ '\n') + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track1/local/install_pb_chime5.sh b/egs/chime6/s5_track1/local/install_pb_chime5.sh new file mode 100755 index 00000000000..a151dc60f12 --- /dev/null +++ b/egs/chime6/s5_track1/local/install_pb_chime5.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Installs pb_chime5 +# miniconda should be installed in $HOME/miniconda3/ + +miniconda_dir=$HOME/miniconda3/ + +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run 'tools/extras/install_miniconda.sh" && exit 1; +fi + +git clone https://github.com/fgnt/pb_chime5.git +cd pb_chime5 +# Download submodule dependencies # https://stackoverflow.com/a/3796947/5766934 +git submodule init +git submodule update + +$miniconda_dir/bin/python -m pip install cython +$miniconda_dir/bin/python -m pip install pymongo +$miniconda_dir/bin/python -m pip install fire +$miniconda_dir/bin/python -m pip install -e pb_bss/ +$miniconda_dir/bin/python -m pip install -e . 
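+
+# A minimal sanity check (a sketch, not required by the recipe): the install can
+# be verified by importing the package with the miniconda interpreter, e.g.
+#   $miniconda_dir/bin/python -c 'import pb_chime5'
+# local/run_gss.sh later expects this checkout to live in ./pb_chime5/, so run
+# this script from the s5_track1 directory.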
diff --git a/egs/chime6/s5_track1/local/json2text.py b/egs/chime6/s5_track1/local/json2text.py new file mode 100755 index 00000000000..34cf52f086b --- /dev/null +++ b/egs/chime6/s5_track1/local/json2text.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +import json +import argparse +import logging +import sys + + +def hms_to_seconds(hms): + hour = hms.split(':')[0] + minute = hms.split(':')[1] + second = hms.split(':')[2].split('.')[0] + + # .xx (10 ms order) + ms10 = hms.split(':')[2].split('.')[1] + + # total seconds + seconds = int(hour) * 3600 + int(minute) * 60 + int(second) + + return '{:07d}'.format(int(str(seconds) + ms10)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('json', type=str, help='JSON transcription file') + parser.add_argument('--mictype', type=str, + choices=['ref', 'worn', 'gss', 'u01', 'u02', 'u03', 'u04', 'u05', 'u06'], + help='Type of microphones') + args = parser.parse_args() + + # logging info + log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s:%(message)s" + logging.basicConfig(level=logging.INFO, format=log_format) + + logging.debug("reading %s", args.json) + with open(args.json, 'rt', encoding="utf-8") as f: + j = json.load(f) + + for x in j: + if '[redacted]' not in x['words']: + session_id = x['session_id'] + speaker_id = x['speaker'] + if args.mictype == 'ref': + mictype = x['ref'] + elif args.mictype == 'worn' or args.mictype == 'gss': + mictype = 'original' + else: + mictype = args.mictype.upper() # convert from u01 to U01 + + # add location tag for scoring (only for dev and eval sets) + if 'location' in x.keys(): + location = x['location'].upper() + else: + location = 'NOLOCATION' + + # remove mic dependency for CHiME-6 + start_time = x['start_time'] + end_time = x['end_time'] + + # remove meta chars and convert to lower + words = x['words'].replace('"', '')\ + .replace('.', '')\ + .replace('?', '')\ + .replace(',', '')\ + .replace(':', '')\ + .replace(';', '')\ + .replace('!', '').lower() + + # remove multiple spaces + words = " ".join(words.split()) + + # convert to seconds, e.g., 1:10:05.55 -> 3600 + 600 + 5.55 = 4205.55 + start_time = hms_to_seconds(start_time) + end_time = hms_to_seconds(end_time) + + uttid = speaker_id + '_' + session_id + if not args.mictype in ['worn', 'gss']: + uttid += '_' + mictype + + if args.mictype == 'gss': + uttid += '-' + start_time + '-' + end_time + else: + uttid += '_' + location + '-' + start_time + '-' + end_time + + # In several utterances, there are inconsistency in the time stamp + # (the end time is earlier than the start time) + # We just ignored such utterances. 
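+            # For reference, the resulting keys look like the following
+            # (ids illustrative; times are in 10 ms units, zero-padded to 7
+            # digits, so the string comparison below behaves numerically):
+            #   gss       : P05_S02-0004060-0004382
+            #   worn      : P05_S02_DINING-0004060-0004382
+            #   array u01 : P05_S02_U01_DINING-0004060-0004382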
+ if end_time > start_time: + sys.stdout.buffer.write((uttid + ' ' + words + '\n').encode("utf-8")) diff --git a/egs/chime6/s5_track1/local/make_noise_list.py b/egs/chime6/s5_track1/local/make_noise_list.py new file mode 100755 index 00000000000..5aaf7fa4062 --- /dev/null +++ b/egs/chime6/s5_track1/local/make_noise_list.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import glob +import os +import sys + + +if len(sys.argv) != 2: + print ("Usage: {} ".format(sys.argv[0])) + raise SystemExit(1) + + +for line in glob.glob("{}/*.wav".format(sys.argv[1])): + fname = os.path.basename(line.strip()) + + print ("--noise-id {} --noise-type point-source " + "--bg-fg-type foreground {}".format(fname, line.strip())) diff --git a/egs/chime6/s5_track1/local/nnet3/compare_wer.sh b/egs/chime6/s5_track1/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/chime6/s5_track1/local/nnet3/decode.sh b/egs/chime6/s5_track1/local/nnet3/decode.sh new file mode 100755 index 00000000000..8fa54e0d4a6 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/decode.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (Author: Daniel Povey, Vijayaditya Peddinti) +# 2019 Vimal Manohar +# Apache 2.0. + +# This script does 2-stage decoding where the first stage is used to get +# reliable frames for i-vector extraction. 
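+#
+# Roughly: stage 1 decodes with online i-vectors, a frame-level weights file is
+# derived from the first-pass CTM (silence down-weighted), per-speaker i-vectors
+# are re-estimated with those weights, and stage 2 re-decodes with them.
+# Illustrative invocation (the real arguments are passed by run.sh):
+#   local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \
+#     data/dev_gss data/lang exp/chain_xxx/tree_sp/graph exp/chain_xxx/tdnn1b_sp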
+ +set -e + +# general opts +iter= +stage=0 +nj=30 +affix= # affix for decode directory + +# ivector opts +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=6000 +ivector_scale=0.75 +get_weights_from_ctm=true +weights_file= # use weights from this archive (must be compressed using gunzip) +silence_weight=0.00001 # apply this weight to silence frames during i-vector extraction +ivector_dir=exp/nnet3 + +# decode opts +pass2_decode_opts="--min-active 1000" +lattice_beam=8 +extra_left_context=0 # change for (B)LSTM +extra_right_context=0 # change for BLSTM +frames_per_chunk=50 # change for (B)LSTM +acwt=0.1 # important to change this when using chain models +post_decode_acwt=1.0 # important to change this when using chain models +extra_left_context_initial=0 +extra_right_context_final=0 + +graph_affix= + +score_opts="--min-lmwt 6 --max-lmwt 13" + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh +. utils/parse_options.sh || exit 1; + +if [ $# -ne 4 ]; then + echo "Usage: $0 [options] " + echo " Options:" + echo " --stage (0|1|2) # start scoring script from part-way through." + echo "e.g.:" + echo "$0 data/dev data/lang exp/tri5a/graph_pp exp/nnet3/tdnn" + exit 1; +fi + +data=$1 # data directory +lang=$2 # data/lang +graph=$3 #exp/tri5a/graph_pp +dir=$4 # exp/nnet3/tdnn + +model_affix=`basename $dir` +ivector_affix=${affix:+_$affix}_chain_${model_affix}${iter:+_iter$iter} +affix=${affix:+_${affix}}${iter:+_iter${iter}} + +if [ $stage -le 1 ]; then + if [ ! -s ${data}_hires/feats.scp ]; then + utils/copy_data_dir.sh $data ${data}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd "$train_cmd" ${data}_hires + steps/compute_cmvn_stats.sh ${data}_hires + utils/fix_data_dir.sh ${data}_hires + fi +fi + +data_set=$(basename $data) +if [ $stage -le 2 ]; then + echo "Extracting i-vectors, stage 1" + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \ + --max-count $max_count \ + ${data}_hires $ivector_dir/extractor \ + $ivector_dir/ivectors_${data_set}${ivector_affix}_stage1; + # float comparisons are hard in bash + if [ `bc <<< "$ivector_scale != 1"` -eq 1 ]; then + ivector_scale_affix=_scale$ivector_scale + else + ivector_scale_affix= + fi + + if [ ! -z "$ivector_scale_affix" ]; then + echo "$0: Scaling iVectors, stage 1" + srcdir=$ivector_dir/ivectors_${data_set}${ivector_affix}_stage1 + outdir=$ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 + mkdir -p $outdir + $train_cmd $outdir/log/scale_ivectors.log \ + copy-matrix --scale=$ivector_scale scp:$srcdir/ivector_online.scp ark:- \| \ + copy-feats --compress=true ark:- ark,scp:$outdir/ivector_online.ark,$outdir/ivector_online.scp; + cp $srcdir/ivector_period $outdir/ivector_period + fi +fi + +decode_dir=$dir/decode${graph_affix}_${data_set}${affix} +# generate the lattices +if [ $stage -le 3 ]; then + echo "Generating lattices, stage 1" + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix}${ivector_scale_affix}_stage1 \ + --skip-scoring true ${iter:+--iter $iter} \ + $graph ${data}_hires ${decode_dir}_stage1; +fi + +if [ $stage -le 4 ]; then + if $get_weights_from_ctm; then + if [ ! 
-z $weights_file ]; then + echo "$0: Using provided vad weights file $weights_file" + ivector_extractor_weights=$weights_file + else + echo "$0 : Generating vad weights file" + ivector_extractor_weights=${decode_dir}_stage1/weights${affix}.gz + local/extract_vad_weights.sh --silence-weight $silence_weight \ + --cmd "$decode_cmd" ${iter:+--iter $iter} \ + ${data}_hires $lang \ + ${decode_dir}_stage1 $ivector_extractor_weights + fi + else + # get weights from best path decoding + ivector_extractor_weights=${decode_dir}_stage1 + fi +fi + +if [ $stage -le 5 ]; then + echo "Extracting i-vectors, stage 2 with weights from $ivector_extractor_weights" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. + steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${data}_hires $lang $ivector_dir/extractor \ + $ivector_extractor_weights $ivector_dir/ivectors_${data_set}${ivector_affix}; +fi + +if [ $stage -le 6 ]; then + echo "Generating lattices, stage 2 with --acwt $acwt" + rm -f ${decode_dir}/.error + steps/nnet3/decode.sh --nj $nj --cmd "$decode_cmd" $pass2_decode_opts \ + --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk" \ + --skip-scoring false ${iter:+--iter $iter} --lattice-beam $lattice_beam \ + --online-ivector-dir $ivector_dir/ivectors_${data_set}${ivector_affix} \ + $graph ${data}_hires ${decode_dir} || touch ${decode_dir}/.error + [ -f ${decode_dir}/.error ] && echo "$0: Error decoding" && exit 1; +fi +exit 0 diff --git a/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh b/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cfa18cb7617 --- /dev/null +++ b/egs/chime6/s5_track1/local/nnet3/run_ivector_common.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train_worn_u100k +test_sets="dev_worn dev_beamformit_ref" +gmm=tri3 +nj=96 + +nnet3_affix=_train_worn_u100k + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 20 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj ${nj} --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,8,9}/$USER/kaldi-data/mfcc/chime5-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l &2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$(utils/make_absolute.sh $1) +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." 
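+  # (For reference, a correctly prepared split directory contains wavs named
+  #  like $adir/S03_U01.CH1.wav for array mics or $adir/S03_P09.wav for worn
+  #  mics, with a matching $jdir/S03.json transcription; these names are only
+  #  examples, see local/generate_chime6_data.sh.)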
+ exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir + +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist2 | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist2 | sort > $dir/wav.scp +elif [ $mictype == "gss" ]; then + find -L $adir -name "P[0-9]*_S[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + print "$f $path\n"; + }' | sort > $dir/wav.scp + + cat $dir/text.orig | sort > $dir/text +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +elif [ $mictype != "gss" ]; then + cut -d" " -f 1 
$dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi + +cut -f 1 -d ' ' $dir/text | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +# Check that data dirs are okay! +utils/validate_data_dir.sh --no-feats $dir || exit 1 diff --git a/egs/chime6/s5_track1/local/prepare_dict.sh b/egs/chime6/s5_track1/local/prepare_dict.sh new file mode 100755 index 00000000000..09083d0e795 --- /dev/null +++ b/egs/chime6/s5_track1/local/prepare_dict.sh @@ -0,0 +1,124 @@ +#!/bin/bash +# Copyright (c) 2018, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +. ./utils/parse_options.sh + +. ./path.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + + +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + + +# check existing directories +[ $# != 0 ] && echo "Usage: $0" && exit 1; + +dir=data/local/dict + +mkdir -p $dir +echo "$0: Getting CMU dictionary" +if [ ! -f $dir/cmudict.done ]; then + [ -d $dir/cmudict ] && rm -rf $dir/cmudict + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict + touch $dir/cmudict.done +fi + +# silence phones, one per line. +for w in sil spn inaudible laughs noise; do + echo $w; +done > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +# For this setup we're discarding stress. +cat $dir/cmudict/cmudict-0.7b.symbols | \ + perl -ne 's:[0-9]::g; s:\r::; print lc($_)' | \ + sort -u > $dir/nonsilence_phones.txt + +# An extra question will be added by including the silence phones in one class. 
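+# (With the phones defined above, extra_questions.txt ends up as the single line
+#    sil spn inaudible laughs noise
+#  which lets the tree building treat the silence phones as one group.)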
+paste -d ' ' -s $dir/silence_phones.txt > $dir/extra_questions.txt + +grep -v ';;;' $dir/cmudict/cmudict-0.7b |\ + uconv -f latin1 -t utf-8 -x Any-Lower |\ + perl -ne 's:(\S+)\(\d+\) :$1 :; s: : :; print;' |\ + perl -ne '@F = split " ",$_,2; $F[1] =~ s/[0-9]//g; print "$F[0] $F[1]";' \ + > $dir/lexicon1_raw_nosil.txt || exit 1; + +# Add prons for laughter, noise, oov +for w in `grep -v sil $dir/silence_phones.txt`; do + echo "[$w] $w" +done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1; + +# we keep all words from the cmudict in the lexicon +# might reduce OOV rate on dev and eval +cat $dir/lexicon2_raw.txt \ + <( echo "mm m" + echo " spn" + echo "cuz k aa z" + echo "cuz k ah z" + echo "cuz k ao z" + echo "mmm m"; \ + echo "hmm hh m"; \ + ) | sort -u | sed 's/[\t ]/\t/' > $dir/iv_lexicon.txt + + +cat data/train*/text | \ + awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \ + sort -nr > $dir/word_counts + +cat $dir/word_counts | awk '{print $2}' > $dir/word_list + +awk '{print $1}' $dir/iv_lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.txt + +echo "*Highest-count OOVs (including fragments) are:" +head -n 10 $dir/oov_counts.txt +echo "*Highest-count OOVs (excluding fragments) are:" +grep -v -E '^-|-$' $dir/oov_counts.txt | head -n 10 || true + +echo "*Training a G2P and generating missing pronunciations" +mkdir -p $dir/g2p/ +phonetisaurus-align --input=$dir/iv_lexicon.txt --ofile=$dir/g2p/aligned_lexicon.corpus +ngram-count -order 4 -kn-modify-counts-at-end -ukndiscount\ + -gt1min 0 -gt2min 0 -gt3min 0 -gt4min 0 \ + -text $dir/g2p/aligned_lexicon.corpus -lm $dir/g2p/aligned_lexicon.arpa +phonetisaurus-arpa2wfst --lm=$dir/g2p/aligned_lexicon.arpa --ofile=$dir/g2p/g2p.fst +awk '{print $2}' $dir/oov_counts.txt > $dir/oov_words.txt +phonetisaurus-apply --nbest 2 --model $dir/g2p/g2p.fst --thresh 5 --accumulate \ + --word_list $dir/oov_words.txt > $dir/oov_lexicon.txt + +## The next section is again just for debug purposes +## to show words for which the G2P failed +cat $dir/oov_lexicon.txt $dir/iv_lexicon.txt | sort -u > $dir/lexicon.txt +rm -f $dir/lexiconp.txt 2>/dev/null; # can confuse later script if this exists. +awk '{print $1}' $dir/lexicon.txt | \ + perl -e '($word_counts)=@ARGV; + open(W, "<$word_counts")||die "opening word-counts $word_counts"; + while() { chop; $seen{$_}=1; } + while() { + ($c,$w) = split; + if (!defined $seen{$w}) { print; } + } ' $dir/word_counts > $dir/oov_counts.g2p.txt + +echo "*Highest-count OOVs (including fragments) after G2P are:" +head -n 10 $dir/oov_counts.g2p.txt + +utils/validate_dict_dir.pl $dir +exit 0; + diff --git a/egs/chime6/s5_track1/local/replace_uttid.py b/egs/chime6/s5_track1/local/replace_uttid.py new file mode 100755 index 00000000000..96c45b58783 --- /dev/null +++ b/egs/chime6/s5_track1/local/replace_uttid.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Copyright Ashish Arora +# Apache 2.0 +# This script is used in score_for_submit. It adds locationid to the utteranceid, +# using uttid_location file, for locationwise scoring. 
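+# Illustrative example (the utterance id and text are made up):
+#   uttid_location line:  P05_S02-0004060-0004382 KITCHEN
+#   per_utt line       :  P05_S02-0004060-0004382 ref  gimme the baker
+#   rewritten output   :  KITCHEN_P05_S02-0004060-0004382 ref  gimme the baker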
+ +import sys, io +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def load_uttid_location(f): + locations = {} + for line in f: + parts=line.strip().split(' ') + uttid, loc = parts[0], parts[1] + locations[uttid] = loc + return locations + +locations = load_uttid_location(open(sys.argv[1],'r', encoding='utf8')) + +for line in open(sys.argv[2],'r', encoding='utf8'): + uttid, res = line.split(None, 1) + try: + location = locations[uttid] + location_uttid = location +'_'+ str(uttid) + output.write(location_uttid + ' ' + res) + except KeyError as e: + raise Exception("Could not find utteranceid in " + "uttid_location file" + "({0})\n".format(str(e))) diff --git a/egs/chime6/s5_track1/local/reverberate_lat_dir.sh b/egs/chime6/s5_track1/local/reverberate_lat_dir.sh new file mode 100755 index 00000000000..f601a37c0e1 --- /dev/null +++ b/egs/chime6/s5_track1/local/reverberate_lat_dir.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2018 Vimal Manohar +# Apache 2.0 + +num_data_reps=1 +cmd=run.pl +nj=20 +include_clean=false + +. utils/parse_options.sh +. ./path.sh + +if [ $# -ne 4 ]; then + echo "Usage: $0 " + exit 1 +fi + +train_data_dir=$1 +noisy_latdir=$2 +clean_latdir=$3 +dir=$4 + +clean_nj=$(cat $clean_latdir/num_jobs) + +$cmd JOB=1:$clean_nj $dir/copy_clean_lattices.JOB.log \ + lattice-copy "ark:gunzip -c $clean_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_clean.JOB.ark,$dir/lats_clean.JOB.scp || exit 1 + +for n in $(seq $clean_nj); do + cat $dir/lats_clean.$n.scp +done > $dir/lats_clean.scp + +for i in $(seq $num_data_reps); do + cat $dir/lats_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' +done > $dir/lats_rvb.scp + +noisy_nj=$(cat $noisy_latdir/num_jobs) +$cmd JOB=1:$noisy_nj $dir/copy_noisy_lattices.JOB>log \ + lattice-copy "ark:gunzip -c $noisy_latdir/lat.JOB.gz |" \ + ark,scp:$dir/lats_noisy.JOB.ark,$dir/lats_noisy.JOB.scp || exit 1 + +optional_clean= +if $include_clean; then + optional_clean=$dir/lats_clean.scp +fi + +for n in $(seq $noisy_nj); do + cat $dir/lats_noisy.$n.scp +done | cat - $dir/lats_rvb.scp ${optional_clean} | sort -k1,1 > $dir/lats.scp + +utils/split_data.sh $train_data_dir $nj +$cmd JOB=1:$nj $dir/copy_lattices.JOB.log \ + lattice-copy "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/lats.scp |" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1 + +echo $nj > $dir/num_jobs + +if [ -f $clean_latdir/ali.1.gz ]; then + $cmd JOB=1:$clean_nj $dir/copy_clean_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $clean_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_clean.JOB.ark,$dir/ali_clean.JOB.scp + + for n in $(seq $clean_nj); do + cat $dir/ali_clean.$n.scp + done > $dir/ali_clean.scp + + for i in $(seq $num_data_reps); do + cat $dir/ali_clean.scp | awk -vi=$i '{print "rev"i"_"$0}' + done > $dir/ali_rvb.scp + + optional_clean= + if $include_clean; then + optional_clean=$dir/ali_clean.scp + fi + + $cmd JOB=1:$noisy_nj $dir/copy_noisy_alignments.JOB.log \ + copy-int-vector "ark:gunzip -c $noisy_latdir/ali.JOB.gz |" \ + ark,scp:$dir/ali_noisy.JOB.ark,$dir/ali_noisy.JOB.scp + + for n in $(seq $noisy_nj); do + cat $dir/ali_noisy.$n.scp + done | cat - $dir/ali_rvb.scp $optional_clean | sort -k1,1 > $dir/ali.scp + + utils/split_data.sh $train_data_dir $nj || exit 1 + $cmd JOB=1:$nj $dir/copy_rvb_alignments.JOB.log \ + copy-int-vector "scp:utils/filter_scp.pl $train_data_dir/split$nj/JOB/utt2spk $dir/ali.scp |" \ + "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1 +fi + +cp $clean_latdir/{final.*,tree,*.mat,*opts,*.txt} $dir || true + +rm 
$dir/lats_{clean,noisy}.*.{ark,scp} $dir/ali_{clean,noisy}.*.{ark,scp} || true # save space diff --git a/egs/chime6/s5_track1/local/run_beamformit.sh b/egs/chime6/s5_track1/local/run_beamformit.sh new file mode 100755 index 00000000000..aa3badd90d8 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_beamformit.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +. ./path.sh + +# Config: +cmd=run.pl +bmf="1 2 3 4" + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_beamformit.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bmf \"1 2 3 4\" # microphones used for beamforming" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +expdir=exp/enhan/`echo $odir | awk -F '/' '{print $NF}'`_`echo $bmf | tr ' ' '_'` + +if ! command -v BeamformIt &>/dev/null ; then + echo "Missing BeamformIt, run 'cd $KALDI_ROOT/tools/; ./extras/install_beamformit.sh; cd -;'" && exit 1 +fi + +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +mkdir -p $odir +mkdir -p $expdir/log + +echo "Will use the following channels: $bmf" +# number of channels +numch=`echo $bmf | tr ' ' '\n' | wc -l` +echo "the number of channels: $numch" + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list + +# this is an input file list of the microphones +# format: 1st_wav 2nd_wav ... nth_wav +input_arrays=$expdir/channels_$numch +for x in `cat $output_wavfiles`; do + echo -n "$x" + for ch in $bmf; do + echo -n " $x.CH$ch.wav" + done + echo "" +done > $input_arrays + +# split the list for parallel processing +# number of jobs are set by the number of WAV files +nj=`wc -l $expdir/wavfiles.list | awk '{print $1}'` +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Beamforming\n" +# making a shell script for each job +for n in `seq $nj`; do +cat << EOF > $expdir/log/beamform.$n.sh +while read line; do + $BEAMFORMIT/BeamformIt -s \$line -c $input_arrays \ + --config_file `pwd`/conf/beamformit.cfg \ + --source_dir $sdir \ + --result_dir $odir +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/beamform.*.sh +$cmd JOB=1:$nj $expdir/log/beamform.JOB.log \ + $expdir/log/beamform.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/run_gss.sh b/egs/chime6/s5_track1/local/run_gss.sh new file mode 100755 index 00000000000..fbdc4af25d1 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_gss.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Shinji Watanabe) + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi + +# Config: +cmd=run.pl +nj=4 +multiarray=outer_array_mics +bss_iterations=5 +context_samples=160000 +. 
utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_gss.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --bss_iterations 5 # Number of EM iterations" + echo " --context_samples 160000 # Left-right context in number of samples" + echo " --multiarray # Multiarray configuration" + exit 1; +fi + +# setting multiarray as "true" uses all mics, we didn't see any performance +# gain from this we have chosen settings that makes the enhacement finish +# in around 1/3 of a day without significant change in performance. +# our result during the experiments are as follows: + +#MAF: multi array = False +#MAT: multi array = True +#Enhancement Iterations Num Microphones Context Computational time for GSS #cpus dev WER eval WER +#GSS(MAF) 10 24 17 hrs 30 62.3 57.98 +#GSS(MAT) 5 24 10s 26 hrs 50 53.15 53.77 +#GSS(MAT) 5 12 10s 9.5 hrs 50 53.09 53.75 + +session_id=$1 +log_dir=$2 +enhanced_dir=$3 +if [ ! -d pb_chime5/ ]; then + echo "Missing pb_chime5, run 'local/install_pb_chime5'" + exit 1 +fi + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir/ ]; then + echo "$miniconda_dir/ does not exist. Please run '../../../tools/extras/install_miniconda.sh'" + exit 1 +fi + +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || \ + { echo "Could not make absolute '$enhanced_dir'" && exit 1; } + +$cmd JOB=1:$nj $log_dir/log/enhance_${session_id}.JOB.log \ + cd pb_chime5/ '&&' \ + $miniconda_dir/bin/python -m pb_chime5.scripts.kaldi_run with \ + chime6=True \ + storage_dir=$enhanced_dir \ + session_id=$session_id \ + job_id=JOB number_of_jobs=$nj \ + bss_iterations=$bss_iterations \ + context_samples=$context_samples \ + multiarray=$multiarray || exit 1 diff --git a/egs/chime6/s5_track1/local/run_wpe.py b/egs/chime6/s5_track1/local/run_wpe.py new file mode 100755 index 00000000000..fbb264f2fd2 --- /dev/null +++ b/egs/chime6/s5_track1/local/run_wpe.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 +# Works with both python2 and python3 +# This script assumes that WPE (nara_wpe) is installed locally using miniconda. +# ../../../tools/extras/install_miniconda.sh and ../../../tools/extras/install_wpe.sh +# needs to be run and this script needs to be launched run with that version of +# python. +# See local/run_wpe.sh for example. 
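+#
+# The --files flag takes all input wavs followed by all output wavs (the first
+# half of the list is read, dereverberated, and written to the second half).
+# Illustrative single-channel call, matching how run_wpe.sh drives it:
+#   python local/run_wpe.py --files in/S02_U02.CH1.wav out/S02_U02.CH1.wav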
+ +import numpy as np +import soundfile as sf +import time +import os, errno +from tqdm import tqdm +import argparse + +# to avoid huge memory consumption we decided to use `wpe_v8` instead of the original wpe by +# following the advice from Christoph Boeddeker at Paderborn University +# https://github.com/chimechallenge/kaldi_chime6/commit/2ea6ac07ef66ad98602f073b24a233cb7f61605c#r36147334 +from nara_wpe.wpe import wpe_v8 as wpe +from nara_wpe.utils import stft, istft +from nara_wpe import project_root + +parser = argparse.ArgumentParser() +parser.add_argument('--files', '-f', nargs='+') +args = parser.parse_args() + +input_files = args.files[:len(args.files)//2] +output_files = args.files[len(args.files)//2:] +out_dir = os.path.dirname(output_files[0]) +try: + os.makedirs(out_dir) +except OSError as e: + if e.errno != errno.EEXIST: + raise + +stft_options = dict( + size=512, + shift=128, + window_length=None, + fading=True, + pad=True, + symmetric_window=False +) + +sampling_rate = 16000 +delay = 3 +iterations = 5 +taps = 10 + +signal_list = [ + sf.read(f)[0] + for f in input_files +] +y = np.stack(signal_list, axis=0) +Y = stft(y, **stft_options).transpose(2, 0, 1) +Z = wpe(Y, iterations=iterations, statistics_mode='full').transpose(1, 2, 0) +z = istft(Z, size=stft_options['size'], shift=stft_options['shift']) + +for d in range(len(signal_list)): + sf.write(output_files[d], z[d,:], sampling_rate) diff --git a/egs/chime6/s5_track1/local/run_wpe.sh b/egs/chime6/s5_track1/local/run_wpe.sh new file mode 100755 index 00000000000..ed512e69aae --- /dev/null +++ b/egs/chime6/s5_track1/local/run_wpe.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian) +# Apache 2.0 + +. ./cmd.sh +. ./path.sh + +# Config: +nj=4 +cmd=run.pl + +. utils/parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Wrong #arguments ($#, expected 3)" + echo "Usage: local/run_wpe.sh [options] " + echo "main options (for others, see top of script file)" + echo " --cmd # Command to run in parallel with" + echo " --nj 50 # number of jobs for parallel processing" + exit 1; +fi + +sdir=$1 +odir=$2 +array=$3 +task=`basename $sdir` +expdir=exp/wpe/${task}_${array} +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +miniconda_dir=$HOME/miniconda3/ +if [ ! -d $miniconda_dir ]; then + echo "$miniconda_dir does not exist. Please run '$KALDI_ROOT/tools/extras/install_miniconda.sh'." + exit 1 +fi + +# check if WPE is installed +result=`$miniconda_dir/bin/python -c "\ +try: + import nara_wpe + print('1') +except ImportError: + print('0')"` + +if [ "$result" == "1" ]; then + echo "WPE is installed" +else + echo "WPE is not installed. 
Please run ../../../tools/extras/install_wpe.sh" + exit 1 +fi + +mkdir -p $odir +mkdir -p $expdir/log + +# wavfiles.list can be used as the name of the output files +output_wavfiles=$expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} > $expdir/channels_input +cat $expdir/channels_input | awk -F '/' '{print $NF}' | sed "s@S@$odir\/S@g" > $expdir/channels_output +paste -d" " $expdir/channels_input $expdir/channels_output > $output_wavfiles + +# split the list for parallel processing +split_wavfiles="" +for n in `seq $nj`; do + split_wavfiles="$split_wavfiles $output_wavfiles.$n" +done +utils/split_scp.pl $output_wavfiles $split_wavfiles || exit 1; + +echo -e "Dereverberation - $task - $array\n" +# making a shell script for each job +for n in `seq $nj`; do +cat <<-EOF > $expdir/log/wpe.$n.sh +while read line; do + $miniconda_dir/bin/python local/run_wpe.py \ + --file \$line +done < $output_wavfiles.$n +EOF +done + +chmod a+x $expdir/log/wpe.*.sh +$cmd JOB=1:$nj $expdir/log/wpe.JOB.log \ + $expdir/log/wpe.JOB.sh + +echo "`basename $0` Done." diff --git a/egs/chime6/s5_track1/local/score.sh b/egs/chime6/s5_track1/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime6/s5_track1/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track1/local/score_for_submit.sh b/egs/chime6/s5_track1/local/score_for_submit.sh new file mode 100755 index 00000000000..ba7d6cde574 --- /dev/null +++ b/egs/chime6/s5_track1/local/score_for_submit.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Copyright 2019 Johns Hopkins University (Author: Shinji Watanabe) +# Apache 2.0 +# +# This script provides official CHiME-6 challenge track 1 submission scores per room and session. +# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=true +enhancement=gss +json= + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-6 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + echo " --enhancement # enhancement type (gss or beamformit)" + echo " --json # directory containing CHiME-6 json files" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get uttid location mapping +local/add_location_to_uttid.sh --enhancement $enhancement $json/dev \ + $dev/scoring_kaldi/wer_details/ $dev/scoring_kaldi/wer_details/uttid_location +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt_loc + +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. 
Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt + +local/add_location_to_uttid.sh --enhancement $enhancement $json/eval \ + $eval/scoring_kaldi/wer_details_devbest/ $eval/scoring_kaldi/wer_details_devbest/uttid_location + +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt_loc +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep "\#csid" $score_result | awk '{sum+=$3+$4+$6} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-6 challenge track 1 ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with {dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime6/s5_track1/local/train_lms_srilm.sh b/egs/chime6/s5_track1/local/train_lms_srilm.sh new file mode 100755 index 00000000000..5a1d56d24b3 --- /dev/null +++ b/egs/chime6/s5_track1/local/train_lms_srilm.sh @@ -0,0 +1,261 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal, Shinji Watanabe) +# Apache 2.0 + +export LC_ALL=C + +# Begin configuration section. +words_file= +train_text= +dev_text= +oov_symbol="" +# End configuration section + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. 
" + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + echo >&2 "You appear to not have SRILM tools installed, either on your path," + echo >&2 "Use the script \$KALDI_ROOT/tools/install_srilm.sh to install it." + exit 1 +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! -s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ -z "$dev_text" ] ; then + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +elif [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + train_text=$datadir/train/text + dev_text=$datadir/dev2h/text + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + +fi + +[ ! -f $words_file ] && echo >&2 "File $words_file must exist!" && exit 1 +[ ! -f $train_text ] && echo >&2 "File $train_text must exist!" && exit 1 +[ ! -f $dev_text ] && echo >&2 "File $dev_text must exist!" && exit 1 + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +# We also have to avoid skewing the LM by incorporating the same sentences +# from different channels +sed -e "s/\.CH.//" -e "s/_.\-./_/" -e "s/NOLOCATION\(\.[LR]\)*-//" -e "s/U[0-9][0-9]_//" $train_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . 
"\n" if @F > 1' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +sed -e "s/\.CH.//" -e "s/_.\-./_/" $dev_text | sort -u | \ + perl -ane 'print join(" ", @F[1..$#F]) . "\n" if @F > 1' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn111.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn112.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn122.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab 
$tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn123.gz -kndiscount1 -gt1min 1 \ + -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -interpolate \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz \ + -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz \ + -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz \ + -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz \ + -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 \ + -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! 
-z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 3 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'//g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + ngram -lm - -order 4 -unk -map-unk "$oov_symbol" -prune-lowprobs -write-lm - |\ + sed 's//'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 +else + echo >&2 "SRILM is not compiled with the support of MaxEnt models." + echo >&2 "You should use the script in \$KALDI_ROOT/tools/install_srilm.sh" + echo >&2 "which will take care of compiling the SRILM with MaxEnt support" + exit 1; +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -prune-lowprobs -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " +echo "" + +for best_ngram in {3,4}gram ; do + outlm=best_${best_ngram}.gz + lmfilename=$(grep "${best_ngram}" $tgtdir/perplexities.txt | head -n 1 | cut -f 1 -d ' ') + echo "$outlm -> $lmfilename" + (cd $tgtdir; rm -f $outlm; ln -sf $(basename $lmfilename) $outlm ) +done diff --git a/egs/chime6/s5_track1/local/wer_output_filter b/egs/chime6/s5_track1/local/wer_output_filter new file mode 100755 index 00000000000..6f4b6400716 --- /dev/null +++ b/egs/chime6/s5_track1/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2017 Johns Hopkins University (Author: Yenda Trmal ) +# Apache 2.0 + + +## Filter for scoring of the STT results. Convert everything to lowercase +## and add some ad-hoc fixes for the hesitations + +perl -e ' + while() { + @A = split(" ", $_); + $id = shift @A; print "$id "; + foreach $a (@A) { + print lc($a) . " " unless $a =~ /\[.*\]/; + } + print "\n"; + }' | \ +sed -e ' + s/\/hmm/g; + s/\/hmm/g; + s/\/hmm/g; +' + +#| uconv -f utf-8 -t utf-8 -x Latin-ASCII + diff --git a/egs/chime6/s5_track1/path.sh b/egs/chime6/s5_track1/path.sh new file mode 100644 index 00000000000..fb1c0489386 --- /dev/null +++ b/egs/chime6/s5_track1/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5_track1/run.sh b/egs/chime6/s5_track1/run.sh new file mode 100755 index 00000000000..0890a939faf --- /dev/null +++ b/egs/chime6/s5_track1/run.sh @@ -0,0 +1,280 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# + +# Begin configuration section. +nj=96 +decode_nj=20 +stage=0 +nnet_stage=-10 +decode_stage=1 +decode_only=false +num_data_reps=4 +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +enhancement=beamformit # gss or beamformit + +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +if [ $decode_only == "true" ]; then + stage=16 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +if [[ ${enhancement} == *gss* ]]; then + enhanced_dir=${enhanced_dir}_multiarray + enhancement=${enhancement}_multiarray +fi + +if [[ ${enhancement} == *beamformit* ]]; then + enhanced_dir=${enhanced_dir} + enhancement=${enhancement} +fi + +test_sets="dev_${enhancement} eval_${enhancement}" +train_set=train_worn_simu_u400k + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. +########################################################################### + +if [ $stage -le 1 ]; then + echo "$0: prepare data..." + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + echo "$0: train lm ..." + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + echo "$0: prepare lang..." + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +######################################################################################### +# In stages 4 to 7, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. 
Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train. +######################################################################################### + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + +if [ $stage -le 5 ]; then + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + if [ ! -d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + + # only use left channel for worn mic recognition + # you can use both left and right channels for training + for dset in train dev; do + utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo + grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text + utils/fix_data_dir.sh data/${dset}_worn + done +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${train_set}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} + done +fi + +################################################################################## +# Now make 13-dim MFCC features. We use 13-dim fetures for GMM-HMM systems. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." 
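+  # What follows is the standard Kaldi feature pipeline: steps/make_mfcc.sh
+  # extracts 13-dimensional MFCCs (using conf/mfcc.conf), compute_cmvn_stats.sh
+  # accumulates per-speaker cepstral mean/variance statistics, and
+  # fix_data_dir.sh keeps the data directory self-consistent.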
+ mfccdir=mfcc + for x in ${train_set}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +################################################################################### +# Stages 9 to 13 train monophone and triphone models. They will be used for +# generating lattices for training the chain model +################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +####################################################################### +# Perform data cleanup for training data. +####################################################################### + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj ${nj} --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# skipping decoding here and performing it in step 16 +########################################################################## + +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj ${nj} \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# DECODING is done in the local/decode.sh script. 
This script performs
+# enhancement, fixes the test sets, performs feature extraction, and two-stage decoding.
+##########################################################################
+
+if [ $stage -le 16 ]; then
+  local/decode.sh --stage $decode_stage \
+    --enhancement $enhancement \
+    --train_set "$train_set"
+fi
+
+exit 0;
diff --git a/egs/chime6/s5_track1/steps b/egs/chime6/s5_track1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/chime6/s5_track1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/chime6/s5_track1/utils b/egs/chime6/s5_track1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/chime6/s5_track1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/chime6/s5_track2/RESULTS b/egs/chime6/s5_track2/RESULTS
new file mode 100644
index 00000000000..cf87e7cc109
--- /dev/null
+++ b/egs/chime6/s5_track2/RESULTS
@@ -0,0 +1,18 @@
+# Results for CHiME-6 track 2 for dev and eval, using pretrained models
+# available at http://kaldi-asr.org/models/m12.
+
+# Speech Activity Detection (SAD)
+       Missed speech   False alarm   Total error
+Dev    4.3             2.1           6.4
+Eval   5.6             5.9           11.5
+
+# The results for the remaining pipeline are only for array U06.
+
+# Diarization
+       DER     JER
+Dev    57.15   83.96
+Eval   54.12   80.33
+
+# ASR nnet3 tdnn+chain
+Dev:  U06 %WER 81.18 [ 47798 / 58881, 1638 ins, 30528 del, 15632 sub ]
+Eval: U06 %WER 85.39 [ 47076 / 55132, 1107 ins, 27768 del, 18201 sub ]
diff --git a/egs/chime6/s5_track2/cmd.sh b/egs/chime6/s5_track2/cmd.sh
new file mode 100644
index 00000000000..86514d94d4d
--- /dev/null
+++ b/egs/chime6/s5_track2/cmd.sh
@@ -0,0 +1,14 @@
+# You can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
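+#
+# For example, if you have no grid and want to run everything locally
+# (an illustration only, not part of this recipe's default setup), you could use:
+#   export train_cmd=run.pl
+#   export decode_cmd=run.pl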
+
+export train_cmd="retry.pl queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
diff --git a/egs/chime6/s5_track2/conf/beamformit.cfg b/egs/chime6/s5_track2/conf/beamformit.cfg
new file mode 100755
index 00000000000..70fdd858651
--- /dev/null
+++ b/egs/chime6/s5_track2/conf/beamformit.cfg
@@ -0,0 +1,50 @@
+# BeamformIt sample configuration file for AMI data (http://groups.inf.ed.ac.uk/ami/download/)
+
+# scrolling size to compute the delays
+scroll_size = 250
+
+# cross correlation computation window size
+window_size = 500
+
+# maximum number of cross-correlation peaks taken into account
+nbest_amount = 4
+
+# flag whether to apply an automatic noise thresholding
+do_noise_threshold = 1
+
+# percentage of frames with lower xcorr taken as noisy
+noise_percent = 10
+
+######## acoustic modelling parameters
+
+# transition probabilities weight for multichannel decoding
+trans_weight_multi = 25
+trans_weight_nbest = 25
+
+###
+
+# flag whether to print the features after setting them, or not
+print_features = 1
+
+# flag whether to use the bad frames in the sum process
+do_avoid_bad_frames = 1
+
+# flag to use the best channel (SNR) as a reference
+# defined from command line
+do_compute_reference = 1
+
+# flag whether to use a uem file or not (process the whole file)
+do_use_uem_file = 0
+
+# flag whether to use an adaptive weights scheme or fixed weights
+do_adapt_weights = 1
+
+# flag whether to output the sph files or just run the system to create the auxiliary files
+do_write_sph_files = 1
+
+#### directories where to store/retrieve info ####
+#channels_file = ./cfg-files/channels
+
+# show needs to be passed as argument normally, here a default one is given just in case
+#show_id = Ttmp
+
diff --git a/egs/chime6/s5_track2/conf/mfcc.conf b/egs/chime6/s5_track2/conf/mfcc.conf
new file mode 100644
index 00000000000..32988403b00
--- /dev/null
+++ b/egs/chime6/s5_track2/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false
+--sample-frequency=16000
diff --git a/egs/chime6/s5_track2/conf/mfcc_hires.conf b/egs/chime6/s5_track2/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..fd64b62eb16
--- /dev/null
+++ b/egs/chime6/s5_track2/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated), which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
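+# The options below keep 40 mel bins and 40 cepstra (40-dimensional features)
+# over a slightly trimmed frequency range.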
+--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-400 diff --git a/egs/chime6/s5_track2/conf/online_cmvn.conf b/egs/chime6/s5_track2/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/chime6/s5_track2/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/chime6/s5_track2/conf/sad.conf b/egs/chime6/s5_track2/conf/sad.conf new file mode 100644 index 00000000000..752bb1cf6c5 --- /dev/null +++ b/egs/chime6/s5_track2/conf/sad.conf @@ -0,0 +1,2 @@ +affix=_1a +nnet_type=stats diff --git a/egs/chime6/s5_track2/diarization b/egs/chime6/s5_track2/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/chime6/s5_track2/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/chain b/egs/chime6/s5_track2/local/chain new file mode 120000 index 00000000000..dd7910711d1 --- /dev/null +++ b/egs/chime6/s5_track2/local/chain @@ -0,0 +1 @@ +../../s5_track1/local/chain/ \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/check_dset_error.py b/egs/chime6/s5_track2/local/check_dset_error.py new file mode 100755 index 00000000000..0ed7f59ae83 --- /dev/null +++ b/egs/chime6/s5_track2/local/check_dset_error.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. + +import argparse +import sys, os +import string + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker per_session text files""") + parser.add_argument("wer_dir_path", type=str, + help="path of directory containing wer files") + parser.add_argument("output_dir_path", type=str, + help="path of the directory containing per speaker output files") + args = parser.parse_args() + return args + +def get_results(filename): + with open(filename) as f: + first_line = f.readline() + parts = first_line.strip().split(',') + total_words = parts[0].split()[-1] + ins = parts[1].split()[0] + deletion = parts[2].split()[0] + sub = parts[3].split()[0] + return int(total_words), int(ins), int(deletion), int(sub) + +def main(): + args = get_args() + recodingid_error_dict={} + min_wer_per_recording = os.path.join(args.wer_dir_path, 'all.txt') + for line in open(min_wer_per_recording, 'r', encoding='utf8'): + toks = line.strip().split() + recordingid = toks[1] + total_words = toks[-5][:-1] + total_errors = toks[-4][:-1] + total_ins = toks[-3][:-1] + total_del = toks[-2][:-1] + total_sub = toks[-1] + recodingid_error_dict[recordingid]=(total_words, total_errors, total_ins, total_del, total_sub) + + recording_spkorder_file = os.path.join(args.output_dir_path, 'recordinid_spkorder') + for line in open(recording_spkorder_file, 'r', encoding='utf8'): + parts = line.strip().split(':') + recordingid = parts[0] + spkorder = parts[1] + spkorder_list=spkorder.split('_') + num_speakers=len(spkorder_list) + total_errors=total_words=total_ins=total_del=total_sub=0 + for i in range(1, num_speakers+1): + filename = 'wer_' + recordingid + '_' + 'r' + str(i)+ 'h' + str(spkorder_list[i-1]) + wer_filename = os.path.join(args.wer_dir_path, filename) + words, ins, deletion, sub = get_results(wer_filename) + total_words += words + total_ins += ins + total_del += deletion + total_sub += sub + total_errors += ins + deletion + sub + assert int(total_words) == 
int(recodingid_error_dict[recordingid][0]), "Total words mismatch" + assert int(total_errors) == int(recodingid_error_dict[recordingid][1]), "Total errors mismatch" + assert int(total_ins) == int(recodingid_error_dict[recordingid][2]), "Total insertions mismatch" + assert int(total_del) == int(recodingid_error_dict[recordingid][3]), "Total deletions mismatch" + assert int(total_sub) == int(recodingid_error_dict[recordingid][4]), "Total substitutions mismatch" + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/check_tools.sh b/egs/chime6/s5_track2/local/check_tools.sh new file mode 120000 index 00000000000..4e835e887f2 --- /dev/null +++ b/egs/chime6/s5_track2/local/check_tools.sh @@ -0,0 +1 @@ +../../s5_track1/local/check_tools.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/convert_rttm_to_utt2spk_and_segments.py b/egs/chime6/s5_track2/local/convert_rttm_to_utt2spk_and_segments.py new file mode 100755 index 00000000000..410dced190c --- /dev/null +++ b/egs/chime6/s5_track2/local/convert_rttm_to_utt2spk_and_segments.py @@ -0,0 +1,98 @@ +#! /usr/bin/env python +# Copyright 2019 Vimal Manohar +# Apache 2.0. + +"""This script converts an RTTM with +speaker info into kaldi utt2spk and segments""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script converts an RTTM with + speaker info into kaldi utt2spk and segments""") + parser.add_argument("--use-reco-id-as-spkr", type=str, + choices=["true", "false"], default="false", + help="Use the recording ID based on RTTM and " + "reco2file_and_channel as the speaker") + parser.add_argument("--append-reco-id-to-spkr", type=str, + choices=["true", "false"], default="false", + help="Append recording ID to the speaker ID") + + parser.add_argument("rttm_file", type=str, + help="""Input RTTM file. + The format of the RTTM file is + """ + """ """) + parser.add_argument("reco2file_and_channel", type=str, + help="""Input reco2file_and_channel. 
+ The format is .""") + parser.add_argument("utt2spk", type=str, + help="Output utt2spk file") + parser.add_argument("segments", type=str, + help="Output segments file") + + args = parser.parse_args() + + args.use_reco_id_as_spkr = bool(args.use_reco_id_as_spkr == "true") + args.append_reco_id_to_spkr = bool(args.append_reco_id_to_spkr == "true") + + if args.use_reco_id_as_spkr: + if args.append_reco_id_to_spkr: + raise Exception("Appending recording ID to speaker does not make sense when using --use-reco-id-as-spkr=true") + + return args + +def main(): + args = get_args() + + file_and_channel2reco = {} + utt2spk={} + segments={} + for line in open(args.reco2file_and_channel): + parts = line.strip().split() + file_and_channel2reco[(parts[1], parts[2])] = parts[0] + + utt2spk_writer = open(args.utt2spk, 'w') + segments_writer = open(args.segments, 'w') + for line in open(args.rttm_file): + parts = line.strip().split() + if parts[0] != "SPEAKER": + continue + + file_id = parts[1] + channel = parts[2] + + try: + reco = file_and_channel2reco[(file_id, channel)] + except KeyError as e: + raise Exception("Could not find recording with " + "(file_id, channel) " + "= ({0},{1}) in {2}: {3}\n".format( + file_id, channel, + args.reco2file_and_channel, str(e))) + + start_time = float(parts[3]) + end_time = start_time + float(parts[4]) + + if args.use_reco_id_as_spkr: + spkr = reco + else: + if args.append_reco_id_to_spkr: + spkr = reco + "-" + parts[7] + else: + spkr = parts[7] + + st = int(start_time * 100) + end = int(end_time * 100) + utt = "{0}-{1:06d}-{2:06d}".format(spkr, st, end) + utt2spk[utt]=spkr + segments[utt]=(reco, start_time, end_time) + + for uttid_id in sorted(utt2spk): + utt2spk_writer.write("{0} {1}\n".format(uttid_id, utt2spk[uttid_id])) + segments_writer.write("{0} {1} {2:7.2f} {3:7.2f}\n".format( + uttid_id, segments[uttid_id][0], segments[uttid_id][1], segments[uttid_id][2])) + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/copy_lat_dir_parallel.sh b/egs/chime6/s5_track2/local/copy_lat_dir_parallel.sh new file mode 120000 index 00000000000..a168a917d92 --- /dev/null +++ b/egs/chime6/s5_track2/local/copy_lat_dir_parallel.sh @@ -0,0 +1 @@ +../../s5_track1/local/copy_lat_dir_parallel.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/decode.sh b/egs/chime6/s5_track2/local/decode.sh new file mode 100755 index 00000000000..876cc0be126 --- /dev/null +++ b/egs/chime6/s5_track2/local/decode.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# +# This script decodes raw utterances through the entire pipeline: +# Feature extraction -> SAD -> Diarization -> ASR +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora +# Apache 2.0 + +# Begin configuration section. +nj=8 +decode_nj=10 +stage=0 +sad_stage=0 +diarizer_stage=0 +decode_diarize_stage=0 +score_stage=0 +enhancement=beamformit + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +enhanced_dir=enhanced +enhanced_dir=$(utils/make_absolute.sh $enhanced_dir) || exit 1 + +# training data +train_set=train_worn_simu_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +. 
./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh +. ./conf/sad.conf + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +####################################################################### +# Prepare the dev and eval data with dereverberation (WPE) and +# beamforming. +####################################################################### +if [ $stage -le 1 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + dereverb_dir=${PWD}/wav/wpe/ + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_wpe.sh --nj 4 --cmd "$train_cmd --mem 20G" \ + ${audio_dir}/${dset} \ + ${dereverb_dir}/${dset} \ + ${mictype} + done + done + + for dset in dev eval; do + for mictype in u01 u02 u03 u04 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${dereverb_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + # Note that for the evaluation sets, we use the flag + # "--train false". This keeps the files segments, text, + # and utt2spk with .bak extensions, so that they can + # be used later for scoring if needed but are not used + # in the intermediate stages. + for dset in dev eval; do + local/prepare_data.sh --mictype ref --train false \ + "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_dereverb + done +fi + +if [ $stage -le 2 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj $decode_nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc_hires.conf \ + data/$x exp/make_mfcc/$x $mfccdir + done +fi + +####################################################################### +# Perform SAD on the dev/eval data +####################################################################### +dir=exp/segmentation${affix} +sad_work_dir=exp/sad${affix}_${nnet_type}/ +sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a + +if [ $stage -le 3 ]; then + for datadir in ${test_sets}; do + test_set=data/${datadir} + if [ ! -f ${test_set}/wav.scp ]; then + echo "$0: Not performing SAD on ${test_set}" + exit 0 + fi + # Perform segmentation + local/segmentation/detect_speech_activity.sh --nj $decode_nj --stage $sad_stage \ + $test_set $sad_nnet_dir mfcc $sad_work_dir \ + data/${datadir} || exit 1 + + mv data/${datadir}_seg data/${datadir}_${nnet_type}_seg + mv data/${datadir}/{segments.bak,utt2spk.bak} data/${datadir}_${nnet_type}_seg + # Generate RTTM file from segmentation performed by SAD. This can + # be used to evaluate the performance of the SAD as an intermediate + # step. 
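+    # An RTTM file contains one "SPEAKER" line per segment, e.g. (hypothetical
+    # values, shown only for illustration):
+    #   SPEAKER <file-id> <channel> <onset> <duration> <NA> <NA> <speaker> <NA> <NA>
+    # The same format is consumed later by
+    # local/convert_rttm_to_utt2spk_and_segments.py after diarization.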
+ steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + data/${datadir}_${nnet_type}_seg/utt2spk data/${datadir}_${nnet_type}_seg/segments \ + data/${datadir}_${nnet_type}_seg/rttm + done +fi + +####################################################################### +# Perform diarization on the dev/eval data +####################################################################### +if [ $stage -le 4 ]; then + for datadir in ${test_sets}; do + local/diarize.sh --nj 10 --cmd "$train_cmd" --stage $diarizer_stage \ + exp/xvector_nnet_1a \ + data/${datadir}_${nnet_type}_seg \ + exp/${datadir}_${nnet_type}_seg_diarization + done +fi + +####################################################################### +# Decode diarized output using trained chain model +####################################################################### +if [ $stage -le 5 ]; then + for datadir in ${test_sets}; do + local/decode_diarized.sh --nj $nj --cmd "$decode_cmd" --stage $decode_diarize_stage \ + exp/${datadir}_${nnet_type}_seg_diarization data/$datadir data/lang \ + exp/chain_${train_set}_cleaned_rvb exp/nnet3_${train_set}_cleaned_rvb \ + data/${datadir}_diarized + done +fi + +####################################################################### +# Score decoded dev/eval sets +####################################################################### +if [ $stage -le 6 ]; then + # final scoring to get the challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh --stage $score_stage \ + --dev_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage \ + --dev_datadir dev_beamformit_dereverb_diarized_hires \ + --eval_decodedir exp/chain_${train_set}_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage \ + --eval_datadir eval_beamformit_dereverb_diarized_hires +fi +exit 0; diff --git a/egs/chime6/s5_track2/local/decode_diarized.sh b/egs/chime6/s5_track2/local/decode_diarized.sh new file mode 100755 index 00000000000..2d0ad6a3b95 --- /dev/null +++ b/egs/chime6/s5_track2/local/decode_diarized.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright 2019 Ashish Arora, Vimal Manohar +# Apache 2.0. +# This script takes an rttm file, and performs decoding on on a test directory. +# The output directory contains a text file which can be used for scoring. + + +stage=0 +nj=8 +cmd=queue.pl +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 6 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/rttm data/dev data/lang_chain exp/chain_train_worn_simu_u400k_cleaned_rvb \ + exp/nnet3_train_worn_simu_u400k_cleaned_rvb data/dev_diarized" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +rttm_dir=$1 +data_in=$2 +lang_dir=$3 +asr_model_dir=$4 +ivector_extractor=$5 +out_dir=$6 + +for f in $rttm_dir/rttm $data_in/wav.scp $data_in/text.bak \ + $lang_dir/L.fst $asr_model_dir/tree_sp/graph/HCLG.fst \ + $asr_model_dir/tdnn1b_sp/final.mdl; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0 copying data files in output directory" + cp $rttm_dir/rttm $rttm_dir/rttm_1 + sed -i 's/'.ENH'/''/g' $rttm_dir/rttm_1 + mkdir -p ${out_dir}_hires + cp ${data_in}/{wav.scp,utt2spk} ${out_dir}_hires + utils/data/get_reco2dur.sh ${out_dir}_hires +fi + +if [ $stage -le 1 ]; then + echo "$0 creating segments file from rttm and utt2spk, reco2file_and_channel " + local/convert_rttm_to_utt2spk_and_segments.py --append-reco-id-to-spkr=true $rttm_dir/rttm_1 \ + <(awk '{print $2".ENH "$2" "$3}' $rttm_dir/rttm_1 |sort -u) \ + ${out_dir}_hires/utt2spk ${out_dir}_hires/segments + + utils/utt2spk_to_spk2utt.pl ${out_dir}_hires/utt2spk > ${out_dir}_hires/spk2utt + + awk '{print $1" "$1" 1"}' ${out_dir}_hires/wav.scp > ${out_dir}_hires/reco2file_and_channel + utils/fix_data_dir.sh ${out_dir}_hires || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0 extracting mfcc freatures using segments file" + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj $nj --cmd queue.pl ${out_dir}_hires + steps/compute_cmvn_stats.sh ${out_dir}_hires + cp $data_in/text.bak ${out_dir}_hires/text +fi + +if [ $stage -le 3 ]; then + echo "$0 performing decoding on the extracted features" + local/nnet3/decode.sh --affix 2stage --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 150 --nj $nj --ivector-dir $ivector_extractor \ + $out_dir $lang_dir $asr_model_dir/tree_sp/graph $asr_model_dir/tdnn1b_sp/ +fi + diff --git a/egs/chime6/s5_track2/local/diarize.sh b/egs/chime6/s5_track2/local/diarize.sh new file mode 100755 index 00000000000..561d5fe7755 --- /dev/null +++ b/egs/chime6/s5_track2/local/diarize.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright 2019 David Snder +# Apache 2.0. +# +# This script takes an input directory that has a segments file (and +# a feats.scp file), and performs diarization on it. The output directory +# contains an RTTM file which can be used to resegment the input data. + +stage=0 +nj=10 +cmd="run.pl" +ref_rttm= + +echo "$0 $@" # Print the command line for logging +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 exp/xvector_nnet_1a data/dev exp/dev_diarization" + echo "Options: " + echo " --nj # number of parallel jobs." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --ref-rttm # if present, used to score output RTTM." + exit 1; +fi + +model_dir=$1 +data_in=$2 +out_dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/segments $model_dir/plda \ + $model_dir/final.raw $model_dir/extract.config; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + echo "$0: keeping only data corresponding to array U06 " + echo "$0: we can skip this stage, to perform diarization on all arrays " + # to perform diarization ond scoring on all array please skip this step and + # pass all_array = true in local/multispeaker_score.sh + cp -r data/$name data/${name}.bak + mv data/$name/wav.scp data/$name/wav.scp.bak + grep 'U06' data/$name/wav.scp.bak > data/$name/wav.scp + utils/fix_data_dir.sh data/$name + nj=2 # since we have reduced number of "speakers" now +fi + +if [ $stage -le 1 ]; then + echo "$0: computing features for x-vector extractor" + utils/fix_data_dir.sh data/${name} + rm -rf data/${name}_cmn + local/nnet3/xvector/prepare_feats.sh --nj $nj --cmd "$cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/segments exp/${name}_cmn/ + utils/fix_data_dir.sh data/${name}_cmn +fi + +if [ $stage -le 2 ]; then + echo "$0: extracting x-vectors for all segments" + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$cmd" \ + --nj $nj --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $model_dir \ + data/${name}_cmn $out_dir/xvectors_${name} +fi + +# Perform PLDA scoring +if [ $stage -le 3 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + echo "$0: performing PLDA scoring between all pairs of x-vectors" + diarization/nnet3/xvector/score_plda.sh --cmd "$cmd" \ + --target-energy 0.5 \ + --nj $nj $model_dir/ $out_dir/xvectors_${name} \ + $out_dir/xvectors_${name}/plda_scores +fi + +if [ $stage -le 4 ]; then + echo "$0: performing clustering using PLDA scores (we assume 4 speakers per recording)" + awk '{print $1, "4"}' data/$name/wav.scp > data/$name/reco2num_spk + diarization/cluster.sh --cmd "$cmd" --nj $nj \ + --reco2num-spk data/$name/reco2num_spk \ + --rttm-channel 1 \ + $out_dir/xvectors_${name}/plda_scores $out_dir + echo "$0: wrote RTTM to output directory ${out_dir}" +fi + +# For scoring the diarization system, we use the same tool that was +# used in the DIHARD II challenge. This is available at: +# https://github.com/nryant/dscore +if [ $stage -le 5 ]; then + # If a reference RTTM file is not provided, we create one using the backed up + # segments and utt2spk files in the original data directory. + if [ -z $ref_rttm ]; then + ref_rttm=data/$name/rttm + echo "$0: preparing ref RTTM file from segments and utt2spk" + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py data/$name/utt2spk.bak \ + data/$name/segments.bak $ref_rttm + fi + grep 'U06' $ref_rttm > ${ref_rttm}.U06 + ref_rttm_path=$(readlink -f ${ref_rttm}.U06) + out_rttm_path=$(readlink -f $out_dir/rttm) + if ! [ -d dscore ]; then + git clone https://github.com/nryant/dscore.git || exit 1; + cd dscore + python -m pip install --user -r requirements.txt + cd .. + fi + cd dscore + python score.py -r $ref_rttm_path -s $out_rttm_path + cd .. 
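+  # score.py reports the diarization error rate (DER) and Jaccard error rate
+  # (JER) of $out_dir/rttm against the U06 reference RTTM; these are the
+  # metrics quoted in the RESULTS file.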
+fi + diff --git a/egs/chime6/s5_track2/local/distant_audio_list b/egs/chime6/s5_track2/local/distant_audio_list new file mode 120000 index 00000000000..0455876cf4d --- /dev/null +++ b/egs/chime6/s5_track2/local/distant_audio_list @@ -0,0 +1 @@ +../../s5_track1/local/distant_audio_list \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/extract_noises.py b/egs/chime6/s5_track2/local/extract_noises.py new file mode 120000 index 00000000000..04a6389916d --- /dev/null +++ b/egs/chime6/s5_track2/local/extract_noises.py @@ -0,0 +1 @@ +../../s5_track1/local/extract_noises.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/extract_vad_weights.sh b/egs/chime6/s5_track2/local/extract_vad_weights.sh new file mode 120000 index 00000000000..0db29cded5d --- /dev/null +++ b/egs/chime6/s5_track2/local/extract_vad_weights.sh @@ -0,0 +1 @@ +../../s5_track1/local/extract_vad_weights.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/gen_aligned_hyp.py b/egs/chime6/s5_track2/local/gen_aligned_hyp.py new file mode 100755 index 00000000000..acaa3a13ad5 --- /dev/null +++ b/egs/chime6/s5_track2/local/gen_aligned_hyp.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +# Copyright 2019 Yusuke Fujita +# Apache 2.0. + +"""This script generates hypothesis utterances aligned with reference segments. + Usage: gen_align_hyp.py alignment.txt wc.txt > hyp.txt + alignment.txt is a session-level word alignment generated by align-text command. + wc.txt is a sequence of utt-id:reference_word_count generated by 'local/get_ref_perspeaker_persession_file.py'. +""" + +import sys, io +import string +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +def load_align_text(f): + alignments = {} + for line in f: + recoid, res = line.split(None, 1) + alignments[recoid] = [] + toks = res.split(';') + for tok in toks: + ref, hyp = tok.split() + alignments[recoid].append((ref, hyp)) + return alignments + +alignments = load_align_text(open(sys.argv[1],'r', encoding='utf8')) + +for line in open(sys.argv[2],'r', encoding='utf8'): + recoid, res = line.split(None, 1) + ali = iter(alignments[recoid]) + toks = res.split() + for tok in toks: + uttid, count = tok.split(':') + count = int(count) + text = '' + for i in range(count): + while True: + ref, hyp = ali.__next__() + if hyp != '': + text += ' ' + hyp + if ref != '': + break + output.write(uttid + ' ' + text.strip() + '\n') diff --git a/egs/chime6/s5_track2/local/generate_chime6_data.sh b/egs/chime6/s5_track2/local/generate_chime6_data.sh new file mode 120000 index 00000000000..62882cd6279 --- /dev/null +++ b/egs/chime6/s5_track2/local/generate_chime6_data.sh @@ -0,0 +1 @@ +../../s5_track1/local/generate_chime6_data.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/get_best_error.py b/egs/chime6/s5_track2/local/get_best_error.py new file mode 100755 index 00000000000..b9d8b0d43e7 --- /dev/null +++ b/egs/chime6/s5_track2/local/get_best_error.py @@ -0,0 +1,84 @@ +#! /usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. +"""This script finds best matching of reference and hypothesis speakers. + For the best matching speakers,it provides the WER for the reference session + (eg:S02) and hypothesis recording (eg: S02_U02)""" + +import itertools +import numpy as np +import argparse +from munkres import Munkres + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script finds best matching of reference and hypothesis speakers. 
+ For the best matching it provides the WER""") + parser.add_argument("WER_dir", type=str, + help="path of WER files") + parser.add_argument("recording_id", type=str, + help="recording_id name") + parser.add_argument("num_speakers", type=str, + help="number of speakers in ref") + args = parser.parse_args() + return args + + +def get_results(filename): + with open(filename) as f: + first_line = f.readline() + parts = first_line.strip().split(',') + total_words = parts[0].split()[-1] + ins = parts[1].split()[0] + deletions = parts[2].split()[0] + sub = parts[3].split()[0] + return total_words, ins, deletions, sub + + +def get_min_wer(recording_id, num_speakers, WER_dir): + best_wer_file = WER_dir + '/' + 'best_wer' + '_' + recording_id + best_wer_writer = open(best_wer_file, 'w') + m = Munkres() + total_error_mat = [0] * num_speakers + all_errors_mat = [0] * num_speakers + for i in range(num_speakers): + total_error_mat[i] = [0] * num_speakers + all_errors_mat[i] = [0] * num_speakers + for i in range(1, num_speakers+1): + for j in range(1, num_speakers+1): + filename = '/wer_' + recording_id + '_' + 'r' + str(i)+ 'h' + str(j) + filename = WER_dir + filename + total_words, ins, deletions, sub = get_results(filename) + ins = int(ins) + deletions = int(deletions) + sub = int(sub) + total_error = ins + deletions + sub + total_error_mat[i-1][j-1]=total_error + all_errors_mat[i-1][j-1]= (total_words, total_error, ins, deletions, sub) + + indexes = m.compute(total_error_mat) + total_errors=total_words=total_ins=total_del=total_sub=0 + spk_order = '(' + for row, column in indexes: + words, errs, ins, deletions, sub = all_errors_mat[row][column] + total_errors += int(errs) + total_words += int(words) + total_ins += int(ins) + total_del += int(deletions) + total_sub += int(sub) + spk_order = spk_order + str(column+1) + ', ' + spk_order = spk_order + ')' + text = "Best error: (#T #E #I #D #S) " + str(total_words)+ ', '+str(total_errors)+ ', '+str(total_ins)+ ', '+str(total_del)+ ', '+str(total_sub) + best_wer_writer.write(" recording_id: "+ recording_id + ' ') + best_wer_writer.write(' best hypothesis speaker order: ' + spk_order + ' ') + best_wer_writer.write(text+ '\n') + best_wer_writer.close() + + +def main(): + args = get_args() + get_min_wer(args.recording_id, int(args.num_speakers), args.WER_dir) + + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py b/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py new file mode 100755 index 00000000000..7b3e14aaa49 --- /dev/null +++ b/egs/chime6/s5_track2/local/get_hyp_perspeaker_perarray_file.py @@ -0,0 +1,56 @@ +#! /usr/bin/env python +# Copyright 2019 Ashish Arora +# Apache 2.0. 
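Returning to get_best_error.py above: its core step is a Hungarian-algorithm assignment over the num_spkrs x num_spkrs matrix of total errors, which is exactly what Munkres().compute() provides. A toy, self-contained illustration follows (not part of the patch; the matrix values are made up).

from munkres import Munkres

# error_mat[r][h] = ins + del + sub when reference speaker r+1 is scored
# against hypothesis speaker h+1 (toy numbers).
error_mat = [[10, 50, 40],
             [55,  5, 60],
             [45, 70, 15]]
assignment = Munkres().compute([row[:] for row in error_mat])  # pass a copy
total = sum(error_mat[r][h] for r, h in assignment)
print(assignment, total)  # [(0, 0), (1, 1), (2, 2)] 30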
+"""This script splits a kaldi (text) file + into per_array per_session per_speaker hypothesis (text) files""" + +import argparse +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_array per_session per_speaker text files""") + parser.add_argument("input_text_path", type=str, + help="path of text files") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_array per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + # S09_U06.ENH-4-704588-704738 + args = get_args() + sessionid_micid_speakerid_dict= {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + temp = uttid_id.strip().split('.')[0] + micid = temp.strip().split('_')[1] + speakerid = uttid_id.strip().split('-')[1] + sessionid = uttid_id.strip().split('_')[0] + sessionid_micid_speakerid = sessionid + '_' + micid + '_' + speakerid + if sessionid_micid_speakerid not in sessionid_micid_speakerid_dict: + sessionid_micid_speakerid_dict[sessionid_micid_speakerid]=list() + sessionid_micid_speakerid_dict[sessionid_micid_speakerid].append(line) + + for sessionid_micid_speakerid in sorted(sessionid_micid_speakerid_dict): + hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + hyp_writer = open(hyp_file, 'w') + combined_hyp_file = args.output_dir_path + '/' + 'hyp' + '_' + sessionid_micid_speakerid + '_comb' + combined_hyp_writer = open(combined_hyp_file, 'w') + utterances = sessionid_micid_speakerid_dict[sessionid_micid_speakerid] + text = '' + for line in utterances: + parts = line.strip().split() + text = text + ' ' + ' '.join(parts[1:]) + hyp_writer.write(line) + combined_utterance = 'utt' + " " + text + combined_hyp_writer.write(combined_utterance) + combined_hyp_writer.write('\n') + combined_hyp_writer.close() + hyp_writer.close() + + +if __name__ == '__main__': + main() + diff --git a/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py b/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py new file mode 100755 index 00000000000..6b00e29e6b1 --- /dev/null +++ b/egs/chime6/s5_track2/local/get_ref_perspeaker_persession_file.py @@ -0,0 +1,79 @@ +#! /usr/bin/env python +# Copyright 2019 Ashish Arora +# Apache 2.0. 
+"""This script splits a kaldi (text) file + into per_speaker per_session reference (text) file""" + +import argparse + +def get_args(): + parser = argparse.ArgumentParser( + description="""This script splits a kaldi text file + into per_speaker per_session text files""") + parser.add_argument("input_text_path", type=str, + help="path of text file") + parser.add_argument("output_dir_path", type=str, + help="Output path for per_session per_speaker reference files") + args = parser.parse_args() + return args + + +def main(): + args = get_args() + sessionid_speakerid_dict= {} + spkrid_mapping = {} + for line in open(args.input_text_path): + parts = line.strip().split() + uttid_id = parts[0] + speakerid = uttid_id.strip().split('_')[0] + sessionid = uttid_id.strip().split('_')[1] + sessionid_speakerid = sessionid + '_' + speakerid + if sessionid_speakerid not in sessionid_speakerid_dict: + sessionid_speakerid_dict[sessionid_speakerid]=list() + sessionid_speakerid_dict[sessionid_speakerid].append(line) + + spkr_num = 1 + prev_sessionid = '' + for sessionid_speakerid in sorted(sessionid_speakerid_dict): + spkr_id = sessionid_speakerid.strip().split('_')[1] + curr_sessionid = sessionid_speakerid.strip().split('_')[0] + if prev_sessionid != curr_sessionid: + prev_sessionid = curr_sessionid + spkr_num = 1 + if spkr_id not in spkrid_mapping: + spkrid_mapping[spkr_id] = spkr_num + spkr_num += 1 + + for sessionid_speakerid in sorted(sessionid_speakerid_dict): + ref_file = args.output_dir_path + '/ref_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + ref_writer = open(ref_file, 'w') + wc_file = args.output_dir_path + '/ref_wc_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + wc_writer = open(wc_file, 'w') + combined_ref_file = args.output_dir_path + '/ref_' + sessionid_speakerid.split('_')[0] + '_' + str( + spkrid_mapping[sessionid_speakerid.split('_')[1]]) + '_comb' + combined_ref_writer = open(combined_ref_file, 'w') + utterances = sessionid_speakerid_dict[sessionid_speakerid] + text = '' + uttid_wc = 'utt' + for line in utterances: + parts = line.strip().split() + uttid_id = parts[0] + utt_text = ' '.join(parts[1:]) + text = text + ' ' + ' '.join(parts[1:]) + ref_writer.write(line) + length = str(len(utt_text.split())) + uttid_id_len = uttid_id + ":" + length + uttid_wc = uttid_wc + ' ' + uttid_id_len + combined_utterance = 'utt' + " " + text + combined_ref_writer.write(combined_utterance) + combined_ref_writer.write('\n') + combined_ref_writer.close() + wc_writer.write(uttid_wc) + wc_writer.write('\n') + wc_writer.close() + ref_writer.close() + +if __name__ == '__main__': + main() diff --git a/egs/chime6/s5_track2/local/install_pb_chime5.sh b/egs/chime6/s5_track2/local/install_pb_chime5.sh new file mode 120000 index 00000000000..ce5ea5f9f08 --- /dev/null +++ b/egs/chime6/s5_track2/local/install_pb_chime5.sh @@ -0,0 +1 @@ +../../s5_track1/local/install_pb_chime5.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/json2text.py b/egs/chime6/s5_track2/local/json2text.py new file mode 120000 index 00000000000..2aa0a8dd1f9 --- /dev/null +++ b/egs/chime6/s5_track2/local/json2text.py @@ -0,0 +1 @@ +../../s5_track1/local/json2text.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/make_noise_list.py b/egs/chime6/s5_track2/local/make_noise_list.py new file mode 120000 index 00000000000..d8dcc7822fc --- /dev/null +++ 
b/egs/chime6/s5_track2/local/make_noise_list.py @@ -0,0 +1 @@ +../../s5_track1/local/make_noise_list.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/make_voxceleb1.pl b/egs/chime6/s5_track2/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/chime6/s5_track2/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! -e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
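The utterance ids built in make_voxceleb1.pl above follow the usual VoxCeleb1 convention <speaker>-<video-id>-<segment>, where the video id is the first 11 characters of the wav basename and the segment id is the 7 characters after the underscore. A small illustration (not part of the patch; the file and speaker names below are hypothetical):

filename = "Zt6aCn0YZTw_0000012"   # hypothetical VoxCeleb1 wav basename
spkr_id = "id10001"                # hypothetical id after deanonymizing via vox1_meta.csv
rec_id = filename[0:11]            # "Zt6aCn0YZTw"
segment = filename[12:12 + 7]      # "0000012"
utt_id = "%s-%s-%s" % (spkr_id, rec_id, segment)
print(utt_id)                      # "id10001-Zt6aCn0YZTw-0000012"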
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/chime6/s5_track2/local/make_voxceleb2.pl b/egs/chime6/s5_track2/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/chime6/s5_track2/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/chime6/s5_track2/local/multispeaker_score.sh b/egs/chime6/s5_track2/local/multispeaker_score.sh new file mode 100755 index 00000000000..74e089c4052 --- /dev/null +++ b/egs/chime6/s5_track2/local/multispeaker_score.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright 2019 Ashish Arora, Yusuke Fujita +# Apache 2.0. +# This script takes a reference and hypothesis text file, and performs +# multispeaker scoring. + +stage=0 +cmd=queue.pl +num_spkrs=4 +num_hyp_spk=4 +datadir=dev_beamformit_dereverb +get_stats=true +all_array=false +declare -a recording_id_array=("S02_U06" "S09_U06") +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/diarized/text data/dev \ + exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi/penalty_1.0/10.txt \ + exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_xvector_sad/scoring_kaldi_multispeaker" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +ref_file=$1 +hyp_file=$2 +out_dir=$3 + +output_dir=$out_dir/per_speaker_output +wer_dir=$out_dir/per_speaker_wer + +# For dev and evaluation set, we take corresopnding arrays +if [[ ${datadir} == *dev* ]]; then + recording_id_array=("S02_U06" "S09_U06") +fi + +if [[ ${datadir} == *eval* ]]; then + recording_id_array=("S01_U06" "S21_U06") +fi + +if [[ ${datadir} == *dev* ]] && [[ $all_array == "true" ]]; then + recording_id_array=("S02_U01" "S02_U02" "S02_U03" "S02_U04" "S02_U06" "S09_U01" "S09_U02" "S09_U03" "S09_U04" "S09_U06") +fi + +if [[ ${datadir} == *eval* ]] && [[ $all_array == "true" ]]; then + recording_id_array=("S01_U01" "S01_U02" "S01_U03" "S01_U04" "S01_U06" "S21_U01" "S21_U02" "S21_U03" "S21_U04" "S21_U06") +fi + +for f in $ref_file $hyp_file; do + [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; +done + +if [ $stage -le 0 ]; then + # generate per speaker per session file at paragraph level for the reference" + # and per speaker per array file at paraghaph level for the hypothesis" + mkdir -p $output_dir $wer_dir + local/wer_output_filter < $ref_file > $output_dir/ref_filt.txt + local/wer_output_filter < $hyp_file > $output_dir/hyp_filt.txt + local/get_ref_perspeaker_persession_file.py $output_dir/ref_filt.txt $output_dir + local/get_hyp_perspeaker_perarray_file.py $output_dir/hyp_filt.txt $output_dir +fi + +if [ $stage -le 1 ]; then + if [ $num_hyp_spk -le 3 ]; then + # create dummy per speaker per array hypothesis files for if the" + # perdicted number of speakers by diarization is less than 4 " + for recording_id in "${recording_id_array[@]}"; do + for (( i=$num_hyp_spk+1; i<$num_spkrs+1; i++ )); do + echo 'utt ' > ${dir}/hyp_${recording_id}_${i}_comb + done + done + fi +fi + +if [ $stage -le 2 ]; then + # calculate wer for each ref and hypothesis speaker" + for recording_id in "${recording_id_array[@]}"; do + for (( i=0; i<$((num_spkrs * num_spkrs)); i++ )); do + ind_r=$((i / num_spkrs + 1)) + ind_h=$((i % num_spkrs + 1)) + sessionid="$(echo $recording_id | cut -d'_' -f1)" + + # compute WER with combined texts + compute-wer --text --mode=present ark:${output_dir}/ref_${sessionid}_${ind_r}_comb \ + ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb \ + > $wer_dir/wer_${recording_id}_r${ind_r}h${ind_h} 2>/dev/null + done + + local/get_best_error.py $wer_dir $recording_id $num_spkrs + done +fi + +if [ $stage -le 3 ]; then + # print best word error rate" + # it will print best wer for each recording and each array" + cat $wer_dir/best_wer* > $wer_dir/all.txt + cat $wer_dir/all.txt | local/print_dset_error.py \ + $output_dir/recordinid_spkorder > $wer_dir/array_wer.txt +fi + +if [ $stage -le 4 ]; then + # checks if DP result of total error is equivalent + # to the sum of the individual errors: + local/check_dset_error.py $wer_dir $output_dir +fi + +if [ $stage -le 5 ] && [[ $get_stats == "true" ]]; then + # generate per utterance wer details at utterance level + mkdir -p $wer_dir/wer_details $wer_dir/wer_details/log/ + while read -r line; + do + recording_id=$(echo "$line" | cut -f1 -d ":") + spkorder_str=$(echo "$line" | cut -f2 -d ":") + sessionid=$(echo "$line" | cut -f1 -d "_") + IFS='_' read -r -a spkorder_list <<< "$spkorder_str" + IFS=" " + ind_r=1 + for ind_h in "${spkorder_list[@]}"; do + + $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_comb.log \ + align-text ark:${output_dir}/ref_${sessionid}_${ind_r}_comb ark:${output_dir}/hyp_${recording_id}_${ind_h}_comb ark:$output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt + + # split hypothesis texts along with reference utterances using word alignment of combined texts + local/gen_aligned_hyp.py $output_dir/alignment_${sessionid}_r${ind_r}h${ind_h}.txt ${output_dir}/ref_wc_${sessionid}_${ind_r} > ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation + + ## compute per utterance alignments + $cmd $wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_per_utt.log \ + cat ${output_dir}/hyp_${recording_id}_r${ind_r}h${ind_h}_ref_segmentation \| \ + align-text --special-symbol="'***'" ark:${output_dir}/ref_${sessionid}_${ind_r} ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} || exit 1 + + $cmd 
$wer_dir/wer_details/log/${recording_id}_r${ind_r}h${ind_h}_ops.log \ + cat $wer_dir/wer_details/per_utt_${recording_id}_r${ind_r}h${ind_h} \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $wer_dir/wer_details/ops_${recording_id}_r${ind_r}h${ind_h} || exit 1; + + ind_r=$(( ind_r + 1 )) + done + done < $output_dir/recordinid_spkorder + # done generating per utterance wer details +fi diff --git a/egs/chime6/s5_track2/local/nnet3/compare_wer.sh b/egs/chime6/s5_track2/local/nnet3/compare_wer.sh new file mode 120000 index 00000000000..87041e833d0 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/compare_wer.sh @@ -0,0 +1 @@ +../../../s5_track1/local/nnet3/compare_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/decode.sh b/egs/chime6/s5_track2/local/nnet3/decode.sh new file mode 120000 index 00000000000..32595ccedbc --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/decode.sh @@ -0,0 +1 @@ +../../../s5_track1/local/nnet3/decode.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/run_ivector_common.sh b/egs/chime6/s5_track2/local/nnet3/run_ivector_common.sh new file mode 120000 index 00000000000..4161993c225 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/run_ivector_common.sh @@ -0,0 +1 @@ +../../../s5_track1/local/nnet3/run_ivector_common.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats.sh b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats.sh new file mode 100755 index 00000000000..cb8fe2e6326 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very +# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding +# window CMVN in a meaningful way, it must be performed prior to performing +# the subsegmentation. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
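The sliding-window CMVN that prepare_feats.sh applies (apply-cmvn-sliding with --center=true and --cmn-window=300, variances left alone) can be pictured with the sketch below. It is a simplification and not part of the patch: Kaldi additionally shifts the window near utterance boundaries to keep it full whenever possible, whereas this version simply truncates it.

import numpy as np

def sliding_cmn(feats, window=300):
    """feats: (num_frames, feat_dim). Subtract, per frame, the mean of a window
    of up to `window` frames centered on that frame (truncated at the edges);
    variances are not normalized, mirroring --norm-vars=false."""
    n = len(feats)
    out = np.empty_like(feats, dtype=float)
    half = window // 2
    for t in range(n):
        lo, hi = max(0, t - half), min(n, t + half + 1)
        out[t] = feats[t] - feats[lo:hi].mean(axis=0)
    return out

normed = sliding_cmn(np.random.randn(1000, 40))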
+ utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp +for f in $data_in/segments $data_in/segments/vad.scp ; do + [ -f $f ] && cp $f $data_out/`basename $f`; +done + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..dcdbe1b1593 --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the x-vector system. Once the training examples are generated, the features +# created by this script can be removed. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
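prepare_feats_for_egs.sh differs from the previous script in one step: after the same sliding-window CMVN it drops non-speech frames with select-voiced-frames, using the per-frame 0/1 decisions in vad.scp. A minimal sketch of that selection (not part of the patch):

import numpy as np

def select_voiced_frames(feats, vad):
    """feats: (num_frames, feat_dim); vad: per-frame 0/1 decisions (1 = voiced).
    Only voiced frames are kept for x-vector training examples."""
    return feats[np.asarray(vad).astype(bool)]

feats = np.random.randn(6, 4)
print(select_voiced_frames(feats, [1, 1, 0, 0, 1, 0]).shape)   # (3, 4)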
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/run_xvector.sh b/egs/chime6/s5_track2/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/chime6/s5_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..94fc7e7682f --- /dev/null +++ b/egs/chime6/s5_track2/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2018 David Snyder +# 2018 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This script trains the x-vector DNN. The recipe is similar to the one +# described in "Diarization is Hard: Some Experiences and Lessons Learned +# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. + +. ./cmd.sh +set -e + +stage=1 +train_stage=-1 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. 
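As a hedged aside on the ranges.* example quoted above ("100304-f-sre2006-kacg-A 1 2 4079 881 23"): the snippet below shows one way to split such a line for inspection. The field names are my reading of the sid/nnet3 x-vector egs format and should be checked against sid/nnet3/xvector/get_egs.sh; they are assumptions, not part of the patch.

fields = "100304-f-sre2006-kacg-A 1 2 4079 881 23".split()
utt_id = fields[0]
archive_indices = fields[1:3]                              # egs archive assignment (assumed)
start_frame, num_frames = int(fields[3]), int(fields[4])   # chunk position and length (assumed)
spk_label = int(fields[5])                                 # integer speaker label used as the training target (assumed)
print(utt_id, archive_indices, start_frame, num_frames, spk_label)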
+# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 500000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 40 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 4 seconds. If the input recording is greater than 4 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=400 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=20 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
+ relu-batchnorm-layer name=tdnn6 dim=128 input=stats + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/chime6/s5_track2/local/prepare_data.sh b/egs/chime6/s5_track2/local/prepare_data.sh new file mode 100755 index 00000000000..c6b8121dab0 --- /dev/null +++ b/egs/chime6/s5_track2/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal) +# Apache 2.0 + +# Begin configuration section. +mictype=worn # worn, ref or others +cleanup=true +train=true + +# End configuration section +. ./utils/parse_options.sh # accept options.. you can run this run.sh with the + +. ./path.sh + +echo >&2 "$0" "$@" +if [ $# -ne 3 ] ; then + echo >&2 "$0" "$@" + echo >&2 "$0: Error: wrong number of arguments" + echo -e >&2 "Usage:\n $0 [opts] " + echo -e >&2 "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train" + exit 1 +fi + +set -e -o pipefail + +adir=$1 +jdir=$2 +dir=$3 + +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) + +if [ "$json_count" -eq 0 ]; then + echo >&2 "We expect that the directory $jdir will contain json files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi +if [ "$wav_count" -eq 0 ]; then + echo >&2 "We expect that the directory $adir will contain wav files." + echo >&2 "That implies you have supplied a wrong path to the data." + exit 1 +fi + +echo "$0: Converting transcription to text" + +mkdir -p $dir +for file in $jdir/*json; do + ./local/json2text.py --mictype $mictype $file +done | \ + sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\ + sed -e 's/ - / /g' |\ + sed -e 's/mm-/mm/g' > $dir/text.orig + +echo "$0: Creating datadir $dir for type=\"$mictype\"" + +if [ $mictype == "worn" ]; then + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key, add .L and .R for left and right channel + # i.e. 
each file will have two entries (left and right channel) + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ + perl -ne '{ + chomp; + $path = $_; + next unless $path; + @F = split "/", $path; + ($f = $F[@F-1]) =~ s/.wav//; + @F = split "_", $f; + print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n"; + print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n"; + }' | sort > $dir/wav.scp + + # generate the transcripts for both left and right channel + # from the original transcript in the form + # P09_S03-0006072-0006147 gimme the baker + # create left and right channel transcript + # P09_S03.L-0006072-0006147 gimme the baker + # P09_S03.R-0006072-0006147 gimme the baker + sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' $dir/text.orig | sort > $dir/text +elif [ $mictype == "ref" ]; then + # fixed reference array + + # first get a text, which will be used to extract reference arrays + perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text + + find -L $adir | grep "\.wav" | sort > $dir/wav.flist + # following command provide the argument for grep to extract only reference arrays + #grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 + paste -d" " \ + <(awk -F "/" '{print $NF}' $dir/wav.flist | sed -e "s/\.wav/.ENH/") \ + $dir/wav.flist | sort > $dir/wav.scp +else + # array mic case + # convert the filenames to wav.scp format, use the basename of the file + # as a the wav.scp key + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ + perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ + sort -u > $dir/wav.scp + + # convert the transcripts from + # P09_S03-0006072-0006147 gimme the baker + # to the per-channel transcripts + # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker + # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker + perl -ne '$l=$_; + for($i=1; $i<=4; $i++) { + ($x=$l)=~ s/-/.CH\Q$i\E-/; + print $x;}' $dir/text.orig | sort > $dir/text + +fi +$cleanup && rm -f $dir/text.* $dir/wav.scp.* $dir/wav.flist + +# Prepare 'segments', 'utt2spk', 'spk2utt' +if [ $mictype == "worn" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" \ + > $dir/segments +elif [ $mictype == "ref" ]; then + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e "s/ P.._/ /" > $dir/segments +else + cut -d" " -f 1 $dir/text | \ + awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\ + sed -e "s/_[A-Z]*\././2" |\ + sed -e 's/ P.._/ /' > $dir/segments +fi +cut -f 1 -d ' ' $dir/segments | \ + perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > $dir/utt2spk + +utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt + +if [ $train != 'true' ]; then + # For scoring the final system, we need the original utt2spk + # and text file. So we keep them with the extension .bak here + # so that they don't affect the validate_data_dir steps in + # the intermediate steps. + for file in text utt2spk spk2utt segments; do + mv $dir/$file $dir/$file.bak + done + + # For dev and eval data, prepare pseudo utt2spk. 
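The transcript keys manipulated above encode the timing directly: in "P09_S03-0006072-0006147" the last two fields are start and end times in units of 10 ms, which is why the awk commands above divide by 100, and the worn-microphone branch duplicates each key into .L and .R channel variants. A small illustration (not part of the patch):

utt = "P09_S03-0006072-0006147"
left = utt.replace("-", ".L-", 1)     # P09_S03.L-0006072-0006147 (left channel key)
right = utt.replace("-", ".R-", 1)    # P09_S03.R-0006072-0006147 (right channel key)
base, start, end = utt.rsplit("-", 2)
print(base, int(start) / 100.0, int(end) / 100.0)   # P09_S03 60.72 61.47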
+ awk '{print $1, $1}' $dir/wav.scp > $dir/utt2spk + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +fi diff --git a/egs/chime6/s5_track2/local/prepare_dict.sh b/egs/chime6/s5_track2/local/prepare_dict.sh new file mode 120000 index 00000000000..ada30947463 --- /dev/null +++ b/egs/chime6/s5_track2/local/prepare_dict.sh @@ -0,0 +1 @@ +../../s5_track1/local/prepare_dict.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/print_dset_error.py b/egs/chime6/s5_track2/local/print_dset_error.py new file mode 100755 index 00000000000..1a7fd4ff365 --- /dev/null +++ b/egs/chime6/s5_track2/local/print_dset_error.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# Copyright 2019 Ashish Arora +# Apache 2.0. + +import sys, io +import string +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +spkorder_writer = open(sys.argv[1],'w', encoding='utf8') +array_id_error_dict={} +for line in infile: + toks = line.strip().split() + recordingid = toks[1] + total_words = toks[-5][:-1] + total_errors = toks[-4][:-1] + total_ins = toks[-3][:-1] + total_del = toks[-2][:-1] + total_sub = toks[-1] + spk_order = toks[6][1] + '_' + toks[7][0] + '_' + toks[8][0] + '_' + toks[9][0] + spkorder_writer.write(recordingid + ':' + spk_order + '\n') + arrayid=recordingid.strip().split('_')[1] + if arrayid not in array_id_error_dict: + array_id_error_dict[arrayid]=[0]*5 + array_id_error_dict[arrayid][0]+=int(total_words) + array_id_error_dict[arrayid][1]+=int(total_errors) + array_id_error_dict[arrayid][2]+=int(total_ins) + array_id_error_dict[arrayid][3]+=int(total_del) + array_id_error_dict[arrayid][4]+=int(total_sub) + + +for arrayid in sorted(array_id_error_dict): + wer = float(array_id_error_dict[arrayid][1])/float(array_id_error_dict[arrayid][0])*100 + wer_detail = "%WER {0:5.2f} [ {1} / {2}, {3} ins, {4} del, {5} sub ]".format(wer, array_id_error_dict[arrayid][0], array_id_error_dict[arrayid][1], array_id_error_dict[arrayid][2], array_id_error_dict[arrayid][3], array_id_error_dict[arrayid][4]) + output.write(arrayid + ' ' + wer_detail + '\n') + diff --git a/egs/chime6/s5_track2/local/reverberate_lat_dir.sh b/egs/chime6/s5_track2/local/reverberate_lat_dir.sh new file mode 120000 index 00000000000..57302268f6d --- /dev/null +++ b/egs/chime6/s5_track2/local/reverberate_lat_dir.sh @@ -0,0 +1 @@ +../../s5_track1/local/reverberate_lat_dir.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_beamformit.sh b/egs/chime6/s5_track2/local/run_beamformit.sh new file mode 120000 index 00000000000..832a16e3ba7 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_beamformit.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_beamformit.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_ivector_common.sh b/egs/chime6/s5_track2/local/run_ivector_common.sh new file mode 120000 index 00000000000..df7fca84335 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_ivector_common.sh @@ -0,0 +1 @@ +../../s5_track1/local/nnet3/run_ivector_common.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_wpe.py b/egs/chime6/s5_track2/local/run_wpe.py new file mode 120000 index 00000000000..6621607c932 --- /dev/null +++ b/egs/chime6/s5_track2/local/run_wpe.py @@ -0,0 +1 @@ +../../s5_track1/local/run_wpe.py \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/run_wpe.sh b/egs/chime6/s5_track2/local/run_wpe.sh new file mode 120000 index 00000000000..187080e62e4 --- /dev/null +++ 
b/egs/chime6/s5_track2/local/run_wpe.sh @@ -0,0 +1 @@ +../../s5_track1/local/run_wpe.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/score.sh b/egs/chime6/s5_track2/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/chime6/s5_track2/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/score_for_submit.sh b/egs/chime6/s5_track2/local/score_for_submit.sh new file mode 100755 index 00000000000..087a078316a --- /dev/null +++ b/egs/chime6/s5_track2/local/score_for_submit.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Apache 2.0 +# +# This script provides CHiME-6 challenge track 2 submission scores. +# It calculates the best search parameter configurations by using the dev set +# and provides wer for dev and eval + +cmd=run.pl +stage=0 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +dev_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_dev_beamformit_dereverb_diarized_2stage +eval_decodedir=exp/chain_train_worn_simu_u400k_cleaned_rvb/tdnn1b_sp/decode_eval_beamformit_dereverb_diarized_2stage +dev_datadir=dev_beamformit_dereverb_diarized_hires +eval_datadir=eval_beamformit_dereverb_diarized_hires + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides CHiME-6 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --dev_decodedir # dev set decoding directory" + echo " --eval_decodedir # eval set decoding directory" + echo " --dev_datadir # dev set data directory" + echo " --eval_datadir # eval set data directory" + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + + exit 1; +fi + +if [ $stage -le 1 ]; then + # obtaining multi speaker WER for all lmwt and wip + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for LMWT in $(seq $min_lmwt $max_lmwt); do + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $dev_datadir --get_stats false data/$dev_datadir/text \ + $dev_decodedir/scoring_kaldi/penalty_$wip/$LMWT.txt \ + $dev_decodedir/scoring_kaldi_multispeaker/penalty_$wip/$LMWT + done + done +fi + +if [ $stage -le 2 ]; then + # obtaining best lmwt, wip and wer + # adding /dev/null to the command list below forces grep to output the filename + mkdir -p $dev_decodedir/scoring_kaldi_multispeaker + grep WER $dev_decodedir/scoring_kaldi_multispeaker/penalty_*/*/per_speaker_wer/array_wer.txt /dev/null \ + | utils/best_wer.sh >& $dev_decodedir/scoring_kaldi_multispeaker/best_wer + + best_wer_file=$(awk '{print $NF}' $dev_decodedir/scoring_kaldi_multispeaker/best_wer) + best_array=$(echo $best_wer_file | awk -F: '{N=NF; print $N}') + best_lmwt=$(echo $best_wer_file | awk -F/ '{N=NF-2; print $N}') + best_wip=$(echo $best_wer_file | awk -F_ '{N=NF-3; print $N}' | awk -F/ '{N=NF-2; print $N}') + + # printing and storing best lmwt, best_array and wip + echo "best array: $best_array" + echo "best LM weight: $best_lmwt" + echo "best insertion penalty weight: $best_wip" + + echo $best_lmwt > $dev_decodedir/scoring_kaldi_multispeaker/lmwt + echo $best_wip > $dev_decodedir/scoring_kaldi_multispeaker/wip + echo $best_array > $dev_decodedir/scoring_kaldi_multispeaker/best_array +fi + +if [ $stage -le 3 ]; then + # 
obtaining per utterance stats for dev + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $dev_datadir data/$dev_datadir/text \ + $dev_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $dev_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 4 ]; then + # obtaining per utterance stats for eval + local/multispeaker_score.sh --cmd "$cmd" \ + --datadir $eval_datadir data/$eval_datadir/text \ + $eval_decodedir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/ +fi + +if [ $stage -le 5 ]; then + # obtaining eval wer corresponding to best lmwt, best_array and wip of dev + best_array="$(cat $dev_decodedir/scoring_kaldi_multispeaker/best_array)" + best_lmwt="$(cat $dev_decodedir/scoring_kaldi_multispeaker/lmwt)" + best_wip="$(cat $dev_decodedir/scoring_kaldi_multispeaker/wip)" + + grep WER $eval_decodedir/scoring_kaldi_multispeaker/penalty_$best_wip/$best_lmwt/per_speaker_wer/array_wer.txt /dev/null \ + | grep $best_array | utils/best_wer.sh >& $eval_decodedir/scoring_kaldi_multispeaker/best_wer + + # printing dev and eval wer + echo "Dev: $(<$dev_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-15 + echo "Eval: $(<$eval_decodedir/scoring_kaldi_multispeaker/best_wer)" | cut -d " " -f 1-14 +fi + diff --git a/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh b/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh new file mode 100755 index 00000000000..91d52b39269 --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/detect_speech_activity.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2016-17 Vimal Manohar +# 2017 Nagendra Kumar Goel +# Apache 2.0. + +# This script does nnet3-based speech activity detection given an input +# kaldi data directory and outputs a segmented kaldi data directory. +# This script can also do music detection and other similar segmentation +# using appropriate options such as --output-name output-music. + +set -e +set -o pipefail +set -u + +if [ -f ./path.sh ]; then . ./path.sh; fi + +affix= # Affix for the segmentation +nj=32 +cmd=queue.pl +stage=-1 + +# Feature options (Must match training) +mfcc_config=conf/mfcc_hires.conf +feat_affix= # Affix for the type of feature used + +output_name=output # The output node in the network +sad_name=sad # Base name for the directory storing the computed loglikes + # Can be music for music detection +segmentation_name=segmentation # Base name for the directory doing segmentation + # Can be segmentation_music for music detection + +# SAD network config +iter=final # Model iteration to use + +# Contexts must ideally match training for LSTM models, but +# may not necessarily for stats components +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# Decoding options +graph_opts="--min-silence-duration=0.03 --min-speech-duration=0.3 --max-speech-duration=10.0" +acwt=0.3 + +# These _in__weight represent the fraction of probability +# to transfer to class. +# e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 +transform_probs_opts="" + +# Postprocessing options +segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. 
Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. + +echo $* + +. utils/parse_options.sh + +if [ $# -ne 5 ]; then + echo "This script does nnet3-based speech activity detection given an input kaldi " + echo "data directory and outputs an output kaldi data directory." + echo "See script for details of the options to be supplied." + echo "Usage: $0 " + echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo "" + echo "Options: " + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # number of parallel jobs to run." + echo " --stage # stage to do partial re-run from." + echo " --convert-data-dir-to-whole # If true, the input data directory is " + echo " # first converted to whole data directory (i.e. whole recordings) " + echo " # and segmentation is done on that." + echo " # If false, then the original segments are " + echo " # retained and they are split into sub-segments." + echo " --output-name # The output node in the network" + echo " --extra-left-context # Set to some large value, typically 40 for LSTM (must match training)" + echo " --extra-right-context # For BLSTM or statistics pooling" + exit 1 +fi + +src_data_dir=$1 # The input data directory that needs to be segmented. + # If convert_data_dir_to_whole is true, any segments in that will be ignored. +sad_nnet_dir=$2 # The SAD neural network +mfcc_dir=$3 # The directory to store the features +dir=$4 # Work directory +data_dir=$5 # The output data directory will be ${data_dir}_seg + +affix=${affix:+_$affix} +feat_affix=${feat_affix:+_$feat_affix} + +data_id=`basename $data_dir` +sad_dir=${dir}/${sad_name}${affix}_${data_id}${feat_affix} +seg_dir=${dir}/${segmentation_name}${affix}_${data_id}${feat_affix} +test_data_dir=data/${data_id}${feat_affix} + +############################################################################### +## Forward pass through the network network and dump the log-likelihoods. +############################################################################### + +frame_subsampling_factor=1 +if [ -f $sad_nnet_dir/frame_subsampling_factor ]; then + frame_subsampling_factor=$(cat $sad_nnet_dir/frame_subsampling_factor) +fi + +mkdir -p $dir +if [ $stage -le 1 ]; then + if [ "$(readlink -f $sad_nnet_dir)" != "$(readlink -f $dir)" ]; then + cp $sad_nnet_dir/cmvn_opts $dir || exit 1 + fi + + ######################################################################## + ## Initialize neural network for decoding using the output $output_name + ######################################################################## + + if [ ! -z "$output_name" ] && [ "$output_name" != output ]; then + $cmd $dir/log/get_nnet_${output_name}.log \ + nnet3-copy --edits="rename-node old-name=$output_name new-name=output" \ + $sad_nnet_dir/$iter.raw $dir/${iter}_${output_name}.raw || exit 1 + iter=${iter}_${output_name} + else + if ! 
diff $sad_nnet_dir/$iter.raw $dir/$iter.raw; then + cp $sad_nnet_dir/$iter.raw $dir/ + fi + fi + + steps/nnet3/compute_output.sh --nj $nj --cmd "$cmd" \ + --iter ${iter} \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk --apply-exp true \ + --frame-subsampling-factor $frame_subsampling_factor \ + ${test_data_dir} $dir $sad_dir || exit 1 +fi + +############################################################################### +## Prepare FST we search to make speech/silence decisions. +############################################################################### + +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 + +graph_dir=${dir}/graph_${output_name} +if [ $stage -le 2 ]; then + mkdir -p $graph_dir + + # 1 for silence and 2 for speech + cat < $graph_dir/words.txt + 0 +silence 1 +speech 2 +EOF + + $cmd $graph_dir/log/make_graph.log \ + steps/segmentation/internal/prepare_sad_graph.py $graph_opts \ + --frame-shift=$(perl -e "print $frame_shift * $frame_subsampling_factor") - \| \ + fstcompile --isymbols=$graph_dir/words.txt --osymbols=$graph_dir/words.txt '>' \ + $graph_dir/HCLG.fst +fi + +############################################################################### +## Do Viterbi decoding to create per-frame alignments. +############################################################################### + +post_vec=$sad_nnet_dir/post_${output_name}.vec +if [ ! -f $sad_nnet_dir/post_${output_name}.vec ]; then + if [ ! -f $sad_nnet_dir/post_${output_name}.txt ]; then + echo "$0: Could not find $sad_nnet_dir/post_${output_name}.vec. " + echo "Re-run the corresponding stage in the training script possibly " + echo "with --compute-average-posteriors=true or compute the priors " + echo "from the training labels" + exit 1 + else + post_vec=$sad_nnet_dir/post_${output_name}.txt + fi +fi + +mkdir -p $seg_dir +if [ $stage -le 3 ]; then + steps/segmentation/internal/get_transform_probs_mat.py \ + --priors="$post_vec" $transform_probs_opts > $seg_dir/transform_probs.mat + + steps/segmentation/decode_sad.sh --acwt $acwt --cmd "$cmd" \ + --nj $nj \ + --transform "$seg_dir/transform_probs.mat" \ + $graph_dir $sad_dir $seg_dir +fi + +############################################################################### +## Post-process segmentation to create kaldi data directory. 
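Before the post-processing stage that follows, it may help to picture what --segment-padding, --min-segment-dur and --merge-consecutive-max-dur do to the decoded speech segments. The sketch below is a simplification for intuition only (not part of the patch, and not equivalent to steps/segmentation/post_process_sad_to_segments.sh): drop segments shorter than the minimum before padding, pad the rest, then merge touching segments as long as the merged span stays within the limit.

def post_process(segments, pad=0.2, min_dur=0.0, merge_max_dur=0.0):
    """segments: sorted (start, end) pairs in seconds from the Viterbi decode.
    Pass merge_max_dur=float("inf") to merge without a length limit."""
    kept = [(max(0.0, s - pad), e + pad) for (s, e) in segments if e - s >= min_dur]
    merged = []
    for s, e in kept:
        if merged and s <= merged[-1][1] and e - merged[-1][0] <= merge_max_dur:
            merged[-1] = (merged[-1][0], max(merged[-1][1], e))   # boundaries touch: merge
        else:
            merged.append((s, e))
    return merged

# pad=0.2 makes the two speech islands touch, so they merge into one segment
print(post_process([(0.50, 0.90), (1.05, 1.60)], pad=0.2, min_dur=0.1, merge_max_dur=10.0))
# [(0.3, 1.8)] (up to floating-point rounding)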
+############################################################################### + +if [ $stage -le 4 ]; then + steps/segmentation/post_process_sad_to_segments.sh \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ + --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ + ${test_data_dir} ${seg_dir} ${seg_dir} +fi + +if [ $stage -le 5 ]; then + utils/data/subsegment_data_dir.sh ${test_data_dir} ${seg_dir}/segments \ + ${data_dir}_seg +fi + +echo "$0: Created output segmented kaldi data directory in ${data_dir}_seg" +exit 0 diff --git a/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh b/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh new file mode 100755 index 00000000000..5701424869a --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/tuning/train_lstm_sad_1a.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using LSTM for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +extra_left_context=60 +extra_right_context=10 +relu_dim=256 +cell_dim=256 +projection_dim=64 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network +dropout_schedule='0,0@0.20,0.1@0.50,0' + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_lstm_asr_sad +fi +dir=$dir${affix:+_$affix} + +if ! cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + fast-lstmp-layer name=lstm1 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn4 input=Append(-6,0,6,12) add-log-stddev=true dim=$relu_dim + fast-lstmp-layer name=lstm2 cell-dim=$cell_dim recurrent-projection-dim=$projection_dim non-recurrent-projection-dim=$projection_dim decay-time=20 delay=-3 dropout-proportion=0.0 + relu-renorm-layer name=tdnn5 input=Append(-12,0,12,24) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 
12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + $train_cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi diff --git a/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh b/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh new file mode 100755 index 00000000000..bb985462f49 --- /dev/null +++ b/egs/chime6/s5_track2/local/segmentation/tuning/train_stats_sad_1a.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0 + +# This is a script to train a TDNN for speech activity detection (SAD) +# using statistics pooling for long-context information. + +stage=0 +train_stage=-10 +get_egs_stage=-10 +egs_opts= + +chunk_width=20 + +# The context is chosen to be around 1 second long. The context at test time +# is expected to be around the same. +extra_left_context=79 +extra_right_context=21 + +relu_dim=256 + +# training options +num_epochs=1 +initial_effective_lrate=0.0003 +final_effective_lrate=0.00003 +num_jobs_initial=3 +num_jobs_final=8 +remove_egs=true +max_param_change=0.2 # Small max-param change for small network + +egs_dir= +nj=40 + +dir= +affix=1a + +data_dir= +targets_dir= + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +set -o pipefail +set -u + +if [ -z "$dir" ]; then + dir=exp/segmentation_1a/tdnn_stats_sad +fi +dir=$dir${affix:+_$affix} + +if ! 
cuda-compiled; then + cat < $dir/cmvn_opts + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=`feat-to-dim scp:$data_dir/feats.scp -` name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 input=lda dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=$relu_dim add-log-stddev=true + relu-renorm-layer name=tdnn3 input=Append(-3,0,3,6) dim=$relu_dim add-log-stddev=true + stats-layer name=tdnn3_stats config=mean+count(-99:3:9:99) + relu-renorm-layer name=tdnn4 input=Append(tdnn3@-6,tdnn3@0,tdnn3@6,tdnn3@12,tdnn3_stats) add-log-stddev=true dim=$relu_dim + stats-layer name=tdnn4_stats config=mean+count(-108:6:18:108) + relu-renorm-layer name=tdnn5 input=Append(tdnn4@-12,tdnn4@0,tdnn4@12,tdnn4@24,tdnn4_stats) dim=$relu_dim + + output-layer name=output include-log-softmax=true dim=3 learning-rate-factor=0.1 input=tdnn5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs/ + + cat <> $dir/configs/vars +num_targets=3 +EOF +fi + +if [ $stage -le 6 ]; then + num_utts=`cat $data_dir/utt2spk | wc -l` + # Set num_utts_subset for diagnostics to a reasonable value + # of max(min(0.005 * num_utts, 300), 12) + num_utts_subset=`perl -e '$n=int($ARGV[0] * 0.005); print ($n > 300 ? 300 : ($n < 12 ? 12 : $n))' $num_utts` + + steps/nnet3/train_raw_rnn.py --stage=$train_stage \ + --feat.cmvn-opts=$cmvn_opts \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$egs_dir" --egs.stage=$get_egs_stage \ + --egs.chunk-left-context=$extra_left_context \ + --egs.chunk-right-context=$extra_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=$num_jobs_initial \ + --trainer.optimization.num-jobs-final=$num_jobs_final \ + --trainer.optimization.initial-effective-lrate=$initial_effective_lrate \ + --trainer.optimization.final-effective-lrate=$final_effective_lrate \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.max-param-change=$max_param_change \ + --trainer.compute-per-dim-accuracy=true \ + --cmd="$decode_cmd" --nj $nj \ + --cleanup=true \ + --cleanup.remove-egs=$remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --use-dense-targets=true \ + --feat-dir=$data_dir \ + --targets-scp="$targets_dir/targets.scp" \ + --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ + --dir=$dir || exit 1 +fi + +if [ $stage -le 7 ]; then + # Use a subset to compute prior over the output targets + #$train_cmd $dir/log/get_priors.log \ + # matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + # ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 + + # Since the train data is individual microphones, while the dev and + # eval are beamformed, it is likely that the train contains a much + # higher ratio of silences. So using priors computed from the train + # data may miss a lot of speech in the dev/eval sets. Hence we manually + # tune the prior on the dev set. 
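  # (Annotation, not part of the original script.) Roughly speaking, the SAD
  # decoding stage divides the network posteriors by these priors to obtain
  # pseudo-likelihoods, so a large first entry deliberately down-weights the
  # corresponding class (here, presumably silence). A commented illustration
  # with a made-up frame posterior [ 0.6 0.3 0.1 ] and the prior [ 30 2 1 ]
  # normalized to sum to one:
  #   perl -e '@post = (0.6, 0.3, 0.1); @prior = (30, 2, 1);
  #     $sum = 0; $sum += $_ for @prior;
  #     printf("%.2f %.2f %.2f\n", map { $post[$_] / ($prior[$_] / $sum) } 0 .. 2);'
  #   # prints roughly 0.66 4.95 3.30
  # so the first class loses to the others even though its raw posterior was
  # the largest.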
+ # With the following prior, the SAD system results are: + # Dev (using -c 0.25) + # MISSED SPEECH = 1188.59 secs ( 3.3 percent of scored time) + # FALARM SPEECH = 539.37 secs ( 1.5 percent of scored time) + echo "[ 30 2 1 ]" > $dir/post_output.vec || exit 1 + + echo 3 > $dir/frame_subsampling_factor +fi + diff --git a/egs/chime6/s5_track2/local/train_diarizer.sh b/egs/chime6/s5_track2/local/train_diarizer.sh new file mode 100755 index 00000000000..71918e7cabc --- /dev/null +++ b/egs/chime6/s5_track2/local/train_diarizer.sh @@ -0,0 +1,186 @@ +#!/bin/bash +# Copyright +# 2019 David Snyder +# Apache 2.0. +# +# This script is based on the run.sh script in the Voxceleb v2 recipe. +# It trains an x-vector DNN for diarization. + +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +data_dir=train_worn_simu_u400k +model_dir=exp/xvector_nnet_1a + +stage=0 +train_stage=-1 + +. ./cmd.sh + +if [ -f ./path.sh ]; then . ./path.sh; fi +set -e -u -o pipefail +. utils/parse_options.sh + +if [ $# -ne 0 ]; then + exit 1 +fi + +if [ $stage -le 0 ]; then + echo "$0: preparing voxceleb 2 data" + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + echo "$0: preparing voxceleb 1 data (see comments if this step fails)" + # The format of the voxceleb 1 corpus has changed several times since it was + # released. Therefore, our dataprep scripts may or may not fail depending + # on the version of the corpus you obtained. + # If you downloaded the corpus soon after it was first released, this + # version of the dataprep script might work: + local/make_voxceleb1.pl $voxceleb1_root data/voxceleb1 + # However, if you've downloaded the corpus recently, you may need to use the + # the following scripts instead: + #local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + #local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We should now have about 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/voxceleb data/voxceleb2_train data/voxceleb2_test +fi + +if [ $stage -le 1 ]; then + echo "$0: preparing features for training data (voxceleb 1 + 2)" + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/voxceleb + # Note that we apply CMN to the MFCCs and write these to the disk. These + # features will later be used to train the x-vector DNN. +fi + +# In this section, we augment the voxceleb data with reverberation. +# Note that we can probably improve the x-vector DNN if we include +# augmentations from the nonspeech regions of the Chime 6 training +# dataset. +if [ $stage -le 2 ]; then + echo "$0: applying augmentation to x-vector training data (just reverb for now)" + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/voxceleb/utt2num_frames > data/voxceleb/reco2dur + + if [ ! 
-d "RIRS_NOISES" ]; then + echo "$0: downloading simulated room impulse response dataset" + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/voxceleb data/voxceleb_reverb + utils/copy_data_dir.sh --utt-suffix "-reverb" data/voxceleb_reverb data/voxceleb_reverb.new + rm -rf data/voxceleb_reverb + mv data/voxceleb_reverb.new data/voxceleb_reverb +fi + +if [ $stage -le 3 ]; then + echo "$0: making MFCCs for augmented training data" + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/voxceleb_reverb exp/make_mfcc $mfccdir + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/voxceleb_combined data/voxceleb_reverb data/voxceleb +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating voxceleb examples, this can be removed. + echo "$0: preparing features to train x-vector DNN" + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/voxceleb_combined data/voxceleb_combined_cmn exp/voxceleb_combined_cmn + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. + min_len=400 + mv data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/voxceleb_combined_cmn/utt2num_frames.bak > data/voxceleb_combined_cmn/utt2num_frames + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2num_frames data/voxceleb_combined_cmn/utt2spk > data/voxceleb_combined_cmn/utt2spk.new + mv data/voxceleb_combined_cmn/utt2spk.new data/voxceleb_combined_cmn/utt2spk + utils/fix_data_dir.sh data/voxceleb_combined_cmn + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. 
+ min_num_utts=8 + awk '{print $1, NF-1}' data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/voxceleb_combined_cmn/spk2num | utils/filter_scp.pl - data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/spk2utt.new + mv data/voxceleb_combined_cmn/spk2utt.new data/voxceleb_combined_cmn/spk2utt + utils/spk2utt_to_utt2spk.pl data/voxceleb_combined_cmn/spk2utt > data/voxceleb_combined_cmn/utt2spk + + utils/filter_scp.pl data/voxceleb_combined_cmn/utt2spk data/voxceleb_combined_cmn/utt2num_frames > data/voxceleb_combined_cmn/utt2num_frames.new + mv data/voxceleb_combined_cmn/utt2num_frames.new data/voxceleb_combined_cmn/utt2num_frames + + utils/fix_data_dir.sh data/voxceleb_combined_cmn +fi + +# Stages 6 through 8 are handled in run_xvector.sh. +# This script trains the x-vector DNN on the augmented voxceleb data. +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage $train_stage \ + --data data/voxceleb_combined_cmn --nnet-dir $model_dir \ + --egs-dir $model_dir/egs + +if [ $stage -le 9 ]; then + echo "$0: preparing a subset of Chime 6 training data to train PLDA model" + utils/subset_data_dir.sh ${data_dir} 100000 data/plda_train + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf --nj 40 --cmd "$train_cmd" \ + data/plda_train exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/plda_train + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/plda_train data/plda_train_cmn exp/plda_train_cmn + if [ -f data/plda_train/segments ]; then + cp data/plda_train/segments data/plda_train_cmn/ + fi +fi + +if [ $stage -le 10 ]; then + echo "$0: extracting x-vector for PLDA training data" + utils/fix_data_dir.sh data/plda_train_cmn + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $model_dir \ + data/plda_train_cmn $model_dir/xvectors_plda_train +fi + +# Train PLDA models +if [ $stage -le 11 ]; then + echo "$0: training PLDA model" + $train_cmd $model_dir/xvectors_plda_train/log/plda.log \ + ivector-compute-plda ark:$model_dir/xvectors_plda_train/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$model_dir/xvectors_plda_train/xvector.scp ark:- \ + | transform-vec $model_dir/xvectors_plda_train/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $model_dir/xvectors_plda_train/plda || exit 1; + cp $model_dir/xvectors_plda_train/plda $model_dir/ + cp $model_dir/xvectors_plda_train/transform.mat $model_dir/ + cp $model_dir/xvectors_plda_train/mean.vec $model_dir/ +fi diff --git a/egs/chime6/s5_track2/local/train_lms_srilm.sh b/egs/chime6/s5_track2/local/train_lms_srilm.sh new file mode 120000 index 00000000000..a7666f6cded --- /dev/null +++ b/egs/chime6/s5_track2/local/train_lms_srilm.sh @@ -0,0 +1 @@ +../../s5_track1/local/train_lms_srilm.sh \ No newline at end of file diff --git a/egs/chime6/s5_track2/local/train_sad.sh b/egs/chime6/s5_track2/local/train_sad.sh new file mode 100755 index 00000000000..e12a0cad694 --- /dev/null +++ b/egs/chime6/s5_track2/local/train_sad.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar +# 2019 Desh Raj +# Apache 2.0 + +# This script is based on local/run_asr_segmentation.sh script in the +# Aspire recipe. It demonstrates nnet3-based speech activity detection for +# segmentation. 
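# (Annotation, not part of the original script.) A typical invocation, matching
# what stage 16 of run.sh in this directory does:
#   local/train_sad.sh --stage 0 --nj 50 \
#     --data-dir data/train_worn_u400k --test-sets "$test_sets" \
#     --sat-model-dir exp/tri3_cleaned --model-dir exp/tri2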
+# This script:
+# 1) Prepares targets (per-frame labels) for a subset of training data
+#    using GMM models
+# 2) Trains TDNN+Stats or TDNN+LSTM neural network using the targets
+# 3) Demonstrates using the SAD system to get segments of dev data
+
+lang=data/lang   # Must match the one used to train the models
+lang_test=data/lang_test  # Lang directory for decoding.
+
+data_dir=
+test_sets=
+# Model directory used to align the $data_dir to get target labels for training
+# SAD. This should typically be a speaker-adapted system.
+sat_model_dir=
+# Model directory used to decode the whole-recording version of the $data_dir to
+# get target labels for training SAD. This should typically be a
+# speaker-independent system like an LDA+MLLT system.
+model_dir=
+graph_dir=   # Graph for decoding whole-recording version of $data_dir.
+             # If not provided, a new one will be created using $lang_test
+
+# List of weights on labels obtained from alignment;
+# labels obtained from decoding; and default labels in out-of-segment regions
+merge_weights=1.0,0.1,0.5
+
+prepare_targets_stage=-10
+nstage=-10
+train_stage=-10
+stage=0
+nj=50
+reco_nj=40
+
+# test options
+test_nj=10
+
+. ./cmd.sh
+. ./conf/sad.conf
+
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+set -e -u -o pipefail
+. utils/parse_options.sh
+
+if [ $# -ne 0 ]; then
+  exit 1
+fi
+
+dir=exp/segmentation${affix}
+sad_work_dir=exp/sad${affix}_${nnet_type}/
+sad_nnet_dir=$dir/tdnn_${nnet_type}_sad_1a
+
+mkdir -p $dir
+mkdir -p ${sad_work_dir}
+
+# See $lang/phones.txt and decide which should be garbage
+garbage_phones="laughs inaudible"
+silence_phones="sil spn noise"
+
+for p in $garbage_phones; do
+  for a in "" "_B" "_E" "_I" "_S"; do
+    echo "$p$a"
+  done
+done > $dir/garbage_phones.txt
+
+for p in $silence_phones; do
+  for a in "" "_B" "_E" "_I" "_S"; do
+    echo "$p$a"
+  done
+done > $dir/silence_phones.txt
+
+if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \
+  steps/segmentation/internal/verify_phones_list.py $lang/phones.txt; then
+  echo "$0: Invalid $dir/{silence,garbage}_phones.txt"
+  exit 1
+fi
+
+# The training data may already be segmented, so we first prepare
+# a "whole" training data (not segmented) for training the SAD
+# system.
+
+whole_data_dir=${data_dir}_whole
+whole_data_id=$(basename $whole_data_dir)
+
+if [ $stage -le 0 ]; then
+  utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir
+fi
+
+###############################################################################
+# Extract features for the whole data directory. We extract 13-dim MFCCs to
+# generate targets using the GMM system, and 40-dim MFCCs to train the NN-based
+# SAD.
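# (Annotation, not part of the original script.) The two feature streams live
# in separate data directories: $whole_data_dir gets the 13-dim MFCCs
# (conf/mfcc.conf) expected by the existing GMM models, while
# ${whole_data_dir}_hires gets 40-dim MFCCs (conf/mfcc_hires.conf) for the
# neural network. A commented sanity check once stage 1 has run:
#   feat-to-dim scp:${whole_data_dir}_hires/feats.scp -   # expect 40
#   feat-to-dim scp:${whole_data_dir}/feats.scp -         # expect 13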
+############################################################################### +if [ $stage -le 1 ]; then + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf \ + $whole_data_dir exp/make_mfcc/${whole_data_id} + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${whole_data_id} + utils/fix_data_dir.sh $whole_data_dir + + utils/copy_data_dir.sh $whole_data_dir ${whole_data_dir}_hires + steps/make_mfcc.sh --nj $reco_nj --cmd "$train_cmd" --write-utt2num-frames true \ + --mfcc-config conf/mfcc_hires.conf \ + ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires exp/make_mfcc/${whole_data_id}_hires + utils/fix_data_dir.sh ${whole_data_dir}_hires +fi + +############################################################################### +# Prepare SAD targets for recordings +############################################################################### +targets_dir=$dir/${whole_data_id}_combined_targets_sub3 +if [ $stage -le 2 ]; then + steps/segmentation/prepare_targets_gmm.sh --stage $prepare_targets_stage \ + --train-cmd "$train_cmd" --decode-cmd "$decode_cmd" \ + --nj $nj --reco-nj $reco_nj --lang-test $lang \ + --garbage-phones-list $dir/garbage_phones.txt \ + --silence-phones-list $dir/silence_phones.txt \ + --merge-weights "$merge_weights" \ + --remove-mismatch-frames false \ + --graph-dir "$graph_dir" \ + $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir +fi + +############################################################################### +# Train a neural network for SAD +############################################################################### +if [ $stage -le 3 ]; then + if [ $nnet_type == "stats" ]; then + # Train a STATS-pooling network for SAD + local/segmentation/tuning/train_stats_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + elif [ $nnet_type == "lstm" ]; then + # Train a TDNN+LSTM network for SAD + local/segmentation/tuning/train_lstm_sad_1a.sh \ + --stage $nstage --train-stage $train_stage \ + --targets-dir ${targets_dir} \ + --data-dir ${whole_data_dir}_hires --affix "1a" || exit 1 + + fi +fi + +exit 0; diff --git a/egs/chime6/s5_track2/local/wer_output_filter b/egs/chime6/s5_track2/local/wer_output_filter new file mode 120000 index 00000000000..12a6c616d3d --- /dev/null +++ b/egs/chime6/s5_track2/local/wer_output_filter @@ -0,0 +1 @@ +../../s5_track1/local/wer_output_filter \ No newline at end of file diff --git a/egs/chime6/s5_track2/path.sh b/egs/chime6/s5_track2/path.sh new file mode 100644 index 00000000000..c2526194bee --- /dev/null +++ b/egs/chime6/s5_track2/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + diff --git a/egs/chime6/s5_track2/run.sh b/egs/chime6/s5_track2/run.sh new file mode 100755 index 00000000000..1350b8e14d5 --- /dev/null +++ b/egs/chime6/s5_track2/run.sh @@ -0,0 +1,296 @@ +#!/bin/bash +# +# Chime-6 Track 2 baseline. 
Based mostly on the Chime-5 recipe, with the exception +# that we are required to perform speech activity detection and speaker +# diarization before ASR, since we do not have access to the oracle SAD and +# diarization labels. +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# 2019 Desh Raj, David Snyder, Ashish Arora +# Apache 2.0 + +# Begin configuration section. +nj=50 +decode_nj=20 +stage=0 +nnet_stage=-10 +sad_stage=0 +diarizer_stage=0 +decode_stage=1 +enhancement=beamformit # for a new enhancement method, + # change this variable and decode stage +decode_only=false +num_data_reps=4 +snrs="20:10:15:5:0" +foreground_snrs="20:10:15:5:0" +background_snrs="20:10:15:5:0" +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. ./path.sh + +if [ $decode_only == "true" ]; then + stage=18 +fi + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +# chime6 data directories, which are generated from ${chime5_corpus}, +# to synchronize audio files across arrays and modify the annotation (JSON) file accordingly +chime6_corpus=${PWD}/CHiME6 +json_dir=${chime6_corpus}/transcriptions +audio_dir=${chime6_corpus}/audio + +# training and test data +train_set=train_worn_simu_u400k +sad_train_set=train_worn_u400k +test_sets="dev_${enhancement}_dereverb eval_${enhancement}_dereverb" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1; + +########################################################################### +# We first generate the synchronized audio files across arrays and +# corresponding JSON files. Note that this requires sox v14.4.2, +# which is installed via miniconda in ./local/check_tools.sh +########################################################################### + +if [ $stage -le 0 ]; then + local/generate_chime6_data.sh \ + --cmd "$train_cmd" \ + ${chime5_corpus} \ + ${chime6_corpus} +fi + +########################################################################### +# We prepare dict and lang in stages 1 to 3. 
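# (Annotation, not part of the original script.) After stage 3 the selected
# SRILM 3-gram (data/srilm/best_3gram.gz) has been compiled into
# data/lang/G.fst. A commented sanity check of the finished lang directory:
#   utils/validate_lang.pl data/lang   # checks phones, lexicon FST, G.fst, etc.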
+########################################################################### + +if [ $stage -le 1 ]; then + # skip u03 and u04 as they are missing + for mictype in worn u01 u02 u05 u06; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/train ${json_dir}/train data/train_${mictype} + done + for dataset in dev; do + for mictype in worn; do + local/prepare_data.sh --mictype ${mictype} --train true \ + ${audio_dir}/${dataset} ${json_dir}/${dataset} \ + data/${dataset}_${mictype} + done + done +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh + + utils/prepare_lang.sh \ + data/local/dict "" data/local/lang data/lang + + local/train_lms_srilm.sh \ + --train-text data/train_worn/text --dev-text data/dev_worn/text \ + --oov-symbol "" --words-file data/lang/words.txt \ + data/ data/srilm +fi + +LM=data/srilm/best_3gram.gz +if [ $stage -le 3 ]; then + # Compiles G for chime5 trigram LM + utils/format_lm.sh \ + data/lang $LM data/local/dict/lexicon.txt data/lang + +fi + +if [ $stage -le 4 ]; then + # remove possibly bad sessions (P11_S03, P52_S19, P53_S24, P54_S24) + # see http://spandh.dcs.shef.ac.uk/chime_challenge/data.html for more details + utils/copy_data_dir.sh data/train_worn data/train_worn_org # back up + grep -v -e "^P11_S03" -e "^P52_S19" -e "^P53_S24" -e "^P54_S24" data/train_worn_org/text > data/train_worn/text + utils/fix_data_dir.sh data/train_worn +fi + + +######################################################################################### +# In stages 5 and 6, we augment and fix train data for our training purpose. point source +# noises are extracted from chime corpus. Here we use 400k utterances from array microphones, +# its augmentation and all the worn set utterances in train. +######################################################################################### + +if [ $stage -le 5 ]; then + echo "$0: Extracting noise list from training data" + local/extract_noises.py $chime6_corpus/audio/train $chime6_corpus/transcriptions/train \ + local/distant_audio_list distant_noises + local/make_noise_list.py distant_noises > distant_noise_list + + noise_list=distant_noise_list + + echo "$0: Preparing simulated RIRs for data augmentation" + if [ ! 
-d RIRS_NOISES/ ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters $noise_list) + + steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 1 \ + --isotropic-noise-addition-probability 1 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 1 \ + --source-sampling-rate 16000 \ + data/train_worn data/train_worn_rvb +fi + +if [ $stage -le 6 ]; then + # combine mix array and worn mics + # randomly extract first 400k utterances from all mics + # if you want to include more training data, you can increase the number of array mic utterances + utils/combine_data.sh data/train_uall data/train_u01 data/train_u02 data/train_u05 data/train_u06 + utils/subset_data_dir.sh data/train_uall 400000 data/train_u400k + utils/combine_data.sh data/${train_set} data/train_worn data/train_worn_rvb data/train_u400k + utils/combine_data.sh data/${sad_train_set} data/train_worn data/train_u400k +fi + +if [ $stage -le 7 ]; then + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + utils/copy_data_dir.sh data/${train_set} data/${train_set}_nosplit + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${train_set}_nosplit data/${train_set} +fi + +################################################################################## +# Now make MFCC features. We use 13-dim MFCCs to train the GMM-HMM models. +################################################################################## + +if [ $stage -le 8 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + echo "$0: make features..." + mfccdir=mfcc + steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" \ + --mfcc-config conf/mfcc.conf \ + data/${train_set} exp/make_mfcc/${train_set} $mfccdir + steps/compute_cmvn_stats.sh data/${train_set} exp/make_mfcc/${train_set} $mfccdir + utils/fix_data_dir.sh data/${train_set} +fi + +################################################################################### +# Stages 9 to 14 train monophone and triphone models. They will be used for +# generating lattices for training the chain model and for obtaining targets +# for training the SAD system. 
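# (Annotation, not part of the original script; simplified.) The link to SAD
# training: frame-level phone alignments from these GMM systems can be mapped
# to speech / silence / garbage classes to form per-frame targets (the actual
# logic is in steps/segmentation/prepare_targets_gmm.sh, invoked from
# local/train_sad.sh). A commented peek at such an alignment, one phone id per
# frame:
#   ali-to-phones --per-frame=true exp/tri3/final.mdl \
#     "ark:gunzip -c exp/tri3/ali.1.gz |" ark,t:- | head -n 1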
+################################################################################### + +if [ $stage -le 9 ]; then + # make a subset for monophone training + utils/subset_data_dir.sh --shortest data/${train_set} 100000 data/${train_set}_100kshort + utils/subset_data_dir.sh data/${train_set}_100kshort 30000 data/${train_set}_30kshort +fi + +if [ $stage -le 10 ]; then + # Starting basic training on MFCC features + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_30kshort data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/${train_set} data/lang exp/mono_ali exp/tri1 +fi + +if [ $stage -le 12 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/${train_set} data/lang exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 13 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set} data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/${train_set} data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 14 ]; then + # The following script cleans the data and produces cleaned data + steps/cleanup/clean_and_segment_data.sh --nj $nj --cmd "$train_cmd" \ + --segmentation-opts "--min-segment-length 0.3 --min-new-segment-length 0.6" \ + data/${train_set} data/lang exp/tri3 exp/tri3_cleaned data/${train_set}_cleaned +fi + +########################################################################## +# CHAIN MODEL TRAINING +# You can also download a pretrained chain ASR model using: +# wget http://kaldi-asr.org/models/12/0012_asr_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_asr_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 15 ]; then + # chain TDNN + local/chain/run_tdnn.sh --nj $nj \ + --stage $nnet_stage \ + --train-set ${train_set}_cleaned \ + --test-sets "$test_sets" \ + --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned_rvb +fi + +########################################################################## +# SAD MODEL TRAINING +# You can also download a pretrained SAD model using: +# wget http://kaldi-asr.org/models/12/0012_sad_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_sad_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 16 ]; then + local/train_sad.sh --stage $sad_stage --nj $nj \ + --data-dir data/${sad_train_set} --test-sets "${test_sets}" \ + --sat-model-dir exp/tri3_cleaned \ + --model-dir exp/tri2 +fi + +########################################################################## +# DIARIZATION MODEL TRAINING +# You can also download a pretrained diarization model using: +# wget http://kaldi-asr.org/models/12/0012_diarization_v1.tar.gz +# Once it is downloaded, extract using: tar -xvzf 0012_diarization_v1.tar.gz +# and copy the contents of the exp/ directory to your exp/ +########################################################################## +if [ $stage -le 17 ]; then + local/train_diarizer.sh --stage $diarizer_stage \ + --data-dir data/${train_set} \ + --model-dir exp/xvector_nnet_1a +fi + 
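# (Annotation, not part of the original script.) Because every block above is
# guarded by a "[ $stage -le N ]" check, partial re-runs only need the right
# option values, e.g. (commented):
#   ./run.sh --stage 16           # redo SAD + diarizer training and decoding
#   ./run.sh --decode-only true   # jump straight to stage 18 (decoding)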
+########################################################################## +# DECODING: In track 2, we are given raw utterances without segment +# or speaker information, so we have to decode the whole pipeline, i.e., +# SAD -> Diarization -> ASR. This is done in the local/decode.sh +# script. +########################################################################## +if [ $stage -le 18 ]; then + local/decode.sh --stage $decode_stage \ + --enhancement $enhancement \ + --test-sets "$test_sets" +fi + +exit 0; + diff --git a/egs/chime6/s5_track2/sid b/egs/chime6/s5_track2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/chime6/s5_track2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/chime6/s5_track2/steps b/egs/chime6/s5_track2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/chime6/s5_track2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/chime6/s5_track2/utils b/egs/chime6/s5_track2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/chime6/s5_track2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/cifar/v1/image/copy_data_dir.sh b/egs/cifar/v1/image/copy_data_dir.sh new file mode 100755 index 00000000000..c923f5cc07a --- /dev/null +++ b/egs/cifar/v1/image/copy_data_dir.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# feats.scp +# images.scp +# vad.scp +# spk2utt +# utt2spk +# text +# +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance and/or speaker names. Note, the recording-ids stay the same. +# + + +# begin configuration section +spk_prefix= +utt_prefix= +spk_suffix= +utt_suffix= +validate_opts= # should rarely be needed. +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" + echo "Options" + echo " --spk-prefix= # Prefix for speaker ids, default empty" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --spk-suffix= # Suffix for speaker ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/utt2spk ]; then + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + exit 1; +fi + +if [ "$destdir" == "$srcdir" ]; then + echo "$0: this script requires and to be different." + exit 1 +fi + +set -e; + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map + +if [ ! -f $srcdir/utt2uniq ]; then + if [[ ! -z $utt_prefix || ! 
-z $utt_suffix ]]; then + cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq + fi +else + cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq +fi + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + +if [ -f $srcdir/feats.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp +fi + +if [ -f $srcdir/vad.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp +fi + +if [ -f $srcdir/images.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/images.scp >$destdir/images.scp +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi +if [ -f $srcdir/cmvn.scp ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp +fi + +rm $destdir/spk_map $destdir/utt_map + +echo "$0: copied data from $srcdir to $destdir" + +for f in feats.scp cmvn.scp vad.scp utt2uniq utt2dur utt2num_frames text images.scp; do + if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then + echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" + echo " ... $destdir/.backup/$f" + mkdir -p $destdir/.backup + mv $destdir/$f $destdir/.backup/ + fi +done + + +[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" +[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" + +utils/validate_data_dir.sh $validate_opts $destdir diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 02321fdd2df..33996c8eef1 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -10,6 +10,7 @@ file is later used by make_features.py to pad each image sufficiently so that they all have an allowed length. This is intended for end2end chain training. 
""" +from __future__ import division import argparse import os @@ -117,14 +118,14 @@ def find_allowed_durations(start_len, end_len, args): (length // args.frame_subsampling_factor)) allowed_lengths.append(length) fp.write("{}\n".format(int(length))) - length *= args.factor + length = max(length * args.factor, length + args.frame_subsampling_factor) return allowed_lengths def main(): args = get_args() - args.factor = 1.0 + args.factor / 100.0 + args.factor = 1.0 + args.factor/100.0 image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames')) @@ -133,7 +134,7 @@ def main(): "Coverage rate: {}%".format(start_dur, end_dur, 100.0 - args.coverage_factor * 2)) logger.info("There will be {} unique allowed lengths " - "for the images.".format(int(math.log(end_dur / start_dur) / + "for the images.".format(int((math.log(float(end_dur)/start_dur))/ math.log(args.factor)))) allowed_durations = find_allowed_durations(start_dur, end_dur, args) diff --git a/egs/cifar/v1/image/matrix_to_image.py b/egs/cifar/v1/image/matrix_to_image.py index 52dcead7479..908b1f8b3ed 100755 --- a/egs/cifar/v1/image/matrix_to_image.py +++ b/egs/cifar/v1/image/matrix_to_image.py @@ -26,6 +26,7 @@ copy-feats --binary=false $(grep $imgid data/train/feats.scp | cut -d' ' -f2) - | \ image/matrix_to_image.py --color=1 > $imgid.bmp """ +from __future__ import division import argparse import sys @@ -59,7 +60,7 @@ num_cols = len(line) # initialize if len(line) != num_cols: raise Exception("All rows should be of the same length") - line = map(float, line) # string to float + line = [float(i) for i in line] # string to float if max(line) > 1: raise Excetion("Element value in the matrix should be normalized and no larger than 1") line = [int(x * 255) for x in line] # float to integer ranging from 0 to 255 @@ -70,7 +71,7 @@ if num_cols % 3 != 0: raise Exception("Number of columns should be a multiple of 3 in the color mode") width = num_rows - height = num_cols / 3 + height = num_cols/3 # reform the image matrix image_array = [[0 for i in range(width * 3)] for j in range(height)] for i in range(height): diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/cifar/v1/image/ocr/make_features.py similarity index 51% rename from egs/madcat_ar/v1/local/make_features.py rename to egs/cifar/v1/image/ocr/make_features.py index a21276d32c2..aa909f596c9 100755 --- a/egs/madcat_ar/v1/local/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -2,27 +2,33 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian +# 2018 Desh Raj """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script reads the images listed in images.scp and writes them to standard output (by default) as Kaldi-formatted matrices (in text form). It also scales the images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. + the images (on left/right sides) with white pixels. It by default performs + augmentation, (directly scaling down and scaling up). It will double the + data but we can turn augmentation off (via --no-augment). If an 'image2num_frames' file is found in the data dir, it will be used to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys import numpy as np from scipy import misc +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") @@ -38,8 +44,15 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--num-channels', type=int, default=1, + help='Number of color channels') +parser.add_argument('--vertical-shift', type=int, default=0, + help='total number of padding pixel per column') +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument('--augment_type', type=str, default='no_aug', + choices=['no_aug', 'random_scale','random_shift'], + help='Subset of data to process.') args = parser.parse_args() @@ -59,18 +72,6 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding @@ -88,21 +89,73 @@ def horizontal_pad(im, allowed_lengths = None): left_padding = int(padding // 2) right_padding = padding - left_padding dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) + if args.num_channels in [1,4]: + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + else: + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding, args.num_channels), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding, args.num_channels), + dtype=int)), axis=1) return im_pad1 +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im -### main ### +def vertical_shift(im, mode='normal'): + if args.vertical_shift == 0: + return im + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'normal': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), 
dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad +### main ### +random.seed(1) data_list_path = args.images_scp_path - if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None allowed_len_handle = args.allowed_len_file_path @@ -123,13 +176,31 @@ def horizontal_pad(im, allowed_lengths = None): line_vect = line.split(' ') image_id = line_vect[0] image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: + if args.num_channels == 4: + im = misc.imread(image_path, mode='L') + else: + im = misc.imread(image_path) + if args.fliplr: + im = np.fliplr(im) + if args.augment_type == 'no_aug' or 'random_shift': + im = get_scaled_image_aug(im, 'normal') + elif args.augment_type == 'random_scale': + im = get_scaled_image_aug(im, 'scaled') + im = horizontal_pad(im, allowed_lengths) + if im is None: num_fail += 1 continue - data = np.transpose(im_horizontal_padded, (1, 0)) + if args.augment_type == 'no_aug' or 'random_scale': + im = vertical_shift(im, 'normal') + elif args.augment_type == 'random_shift': + im = vertical_shift(im, 'notmid') + if args.num_channels in [1,4]: + data = np.transpose(im, (1, 0)) + elif args.num_channels == 3: + H = im.shape[0] + W = im.shape[1] + C = im.shape[2] + data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/cifar/v1/image/select_image_in_egs.py b/egs/cifar/v1/image/select_image_in_egs.py index 88d7d568e66..dbf48e6403d 100755 --- a/egs/cifar/v1/image/select_image_in_egs.py +++ b/egs/cifar/v1/image/select_image_in_egs.py @@ -9,6 +9,7 @@ # --vertical-shift=0.3 --srand=27 --num-channels=3 ark:exp/cifar10_egs/egs.1.ark ark,t:- | \ # image/select_image_in_egs.py $id | image/matrix_to_image.py --color 3 > $id.bmp +from __future__ import print_function import argparse import sys diff --git a/egs/cifar/v1/local/process_data.py b/egs/cifar/v1/local/process_data.py index 51173dafc6f..38a599297d2 100755 --- a/egs/cifar/v1/local/process_data.py +++ b/egs/cifar/v1/local/process_data.py @@ -6,6 +6,7 @@ """ This script prepares the training and test data for CIFAR-10 or CIFAR-100. 
""" +from __future__ import division import argparse import os @@ -14,13 +15,13 @@ parser = argparse.ArgumentParser(description="""Converts train/test data of CIFAR-10 or CIFAR-100 to Kaldi feature format""") -parser.add_argument('database', type=str, +parser.add_argument('database', default='data/dl/cifar-10-batches-bin', help='path to downloaded cifar data (binary version)') -parser.add_argument('dir', type=str, help='output dir') -parser.add_argument('--cifar-version', type=str, default='CIFAR-10', choices=['CIFAR-10', 'CIFAR-100']) -parser.add_argument('--dataset', type=str, default='train', choices=['train', 'test']) -parser.add_argument('--out-ark', type=str, default='-', help='where to write output feature data') +parser.add_argument('dir', help='output dir') +parser.add_argument('--cifar-version', default='CIFAR-10', choices=['CIFAR-10', 'CIFAR-100']) +parser.add_argument('--dataset', default='train', choices=['train', 'test']) +parser.add_argument('--out-ark', default='-', help='where to write output feature data') args = parser.parse_args() @@ -37,7 +38,7 @@ def load_cifar10_data_batch(datafile): for i in range(num_images_in_batch): label = ord(fh.read(1)) bin_img = fh.read(C * H * W) - img = [[[ord(byte) / 255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] + img = [[[ord(byte)/255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] for row in range(H)] for channel in range(C)] labels += [label] data += [img] @@ -52,7 +53,7 @@ def load_cifar100_data_batch(datafile, num_images_in_batch): coarse_label = ord(fh.read(1)) fine_label = ord(fh.read(1)) bin_img = fh.read(C * H * W) - img = [[[ord(byte) / 255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] + img = [[[ord(byte)/255.0 for byte in bin_img[channel*H*W+row*W:channel*H*W+(row+1)*W]] for row in range(H)] for channel in range(C)] fine_labels += [fine_label] coarse_labels += [coarse_label] @@ -80,7 +81,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") diff --git a/egs/cmu_cslu_kids/README b/egs/cmu_cslu_kids/README new file mode 100644 index 00000000000..0b8512e2487 --- /dev/null +++ b/egs/cmu_cslu_kids/README @@ -0,0 +1,21 @@ +This is an ASR recipe for children speech using cmu_kids and cslu_kids. +Both of the corpora can be found on LDC: + - cmu_kids : https://catalog.ldc.upenn.edu/LDC97S63 + - cslu_kids: https://catalog.ldc.upenn.edu/LDC2007S18 + +To run this recipe, you'll need a copy of both corpora: + ./run.sh --cmu_kids --cslu_kids + +By default, this recipe will download an LM pretrained on LibriSpeech from +lm_url=www.openslr.org/resources/11. If you already have a copy of this LM +and do not wish to redownload, you can specify the LM path using the --lm_src option: + ./run.sh --cmu_kids --cslu_kids \ + --lm_src + +This recipe will also download and clean CMU_Dict by default. 
If you have a clean copy +already, or wish to use your own dictionary, simply copy your version of the dict to + data/local/dict + +To run extra features for triphone models or VLTN, set the following options true: + ./run.sh --cmu_kids --cslu_kids \ + --vtln true --extra_features true diff --git a/egs/cmu_cslu_kids/s5/cmd.sh b/egs/cmu_cslu_kids/s5/cmd.sh new file mode 100644 index 00000000000..179307556d5 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/cmd.sh @@ -0,0 +1,23 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +# the use of cuda_cmd is deprecated, used only in 'nnet1', +export cuda_cmd="queue.pl --gpu 1" + +if [[ "$(hostname -f)" == "*.fit.vutbr.cz" ]]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +fi diff --git a/egs/cmu_cslu_kids/s5/conf/decode.config b/egs/cmu_cslu_kids/s5/conf/decode.config new file mode 100644 index 00000000000..10b0eee900b --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/decode.config @@ -0,0 +1,4 @@ +# Use wider-than-normal decoding beams for RM. +first_beam=16.0 +beam=20.0 +lattice_beam=10.0 diff --git a/egs/cmu_cslu_kids/s5/conf/decode_dnn.config b/egs/cmu_cslu_kids/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..e7cfca74763 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/decode_dnn.config @@ -0,0 +1,8 @@ +# In RM, the optimal decode LMWT is in range 2..5, which is different from usual 10..15 +# (it is caused by using simple rule-based LM, instead of n-gram LM), +scoring_opts="--min-lmwt 2 --max-lmwt 10" +# Still, it is better to use --acwt 0.1, both for decoding and sMBR, +acwt=0.1 +# For this small task we can afford to have large beams, +beam=30.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=18.0 # this has most effect on size of the lattices. diff --git a/egs/cmu_cslu_kids/s5/conf/mfcc.conf b/egs/cmu_cslu_kids/s5/conf/mfcc.conf new file mode 100644 index 00000000000..6bbcb763153 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. 
+--allow_downsample=true diff --git a/egs/cmu_cslu_kids/s5/conf/mfcc_hires.conf b/egs/cmu_cslu_kids/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..40f95e97010 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/mfcc_hires.conf @@ -0,0 +1,11 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) +--allow-downsample=true diff --git a/egs/cmu_cslu_kids/s5/conf/online_cmvn.conf b/egs/cmu_cslu_kids/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/reverb/s5/conf/fbank.conf b/egs/cmu_cslu_kids/s5/conf/plp.conf similarity index 57% rename from egs/reverb/s5/conf/fbank.conf rename to egs/cmu_cslu_kids/s5/conf/plp.conf index c4b73674cab..e7e8a9e14af 100644 --- a/egs/reverb/s5/conf/fbank.conf +++ b/egs/cmu_cslu_kids/s5/conf/plp.conf @@ -1,2 +1,2 @@ # No non-default options for now. - +--allow_downsample=true diff --git a/egs/cmu_cslu_kids/s5/local/chain/compare_wer.sh b/egs/cmu_cslu_kids/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..8ee5db2326a --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/compare_wer.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/cmu_cslu_kids/s5/local/chain/run_tdnnf.sh b/egs/cmu_cslu_kids/s5/local/chain/run_tdnnf.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/run_tdnnf.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/cmu_cslu_kids/s5/local/chain/tdnnf_decode.sh b/egs/cmu_cslu_kids/s5/local/chain/tdnnf_decode.sh new file mode 100755 index 00000000000..8d124193584 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/tdnnf_decode.sh @@ -0,0 +1,82 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Decode on new data set using trained model. +# The data directory should be prepared in kaldi style. +# Usage: +# ./local/chain/tdnnF_decode.sh --data_src + +set -euo pipefail +echo "$0 $@" + +stage=0 +decode_nj=10 +data_src= +affix= +tree_affix= +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null || true + + ( + nspk=$(wc -l <$data_hires/spk2utt) + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nspk --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir $ivect_dir \ + $tree_dir/graph_tgsmall $data_hires ${dir}/decode_tgsmall_$data_name || exit 1 + + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test_{tgsmall,tglarge} \ + $data_hires ${dir}/decode_{tgsmall,tglarge}_$data_name || exit 1 + ) || touch $dir/.error & + + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + diff --git a/egs/cmu_cslu_kids/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/cmu_cslu_kids/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..51e0123d0f2 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017-2018 Yiming Wang +# 2019 Fei Wu + +# Based on material recipe for low-resource languages +# Factored TDNN with skip connectiong and splicing (two bottle neck layers) + +# WER results on dev +# Model LM Corpus WER(%) +# tdnn_1a tg_large Combined 11.72 +# tdnn_1a tg_small Combined 13.61 +# tdnn_1a tg_large CMU_Kids 17.26 +# tdnn_1a tg_small CMU_Kids 26.43 +# tdnn_1a tg_large CSLU_Kids 10.80 +# tdnn_1a tg_small CSLU_Kids 12.50 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp +# exp/chain/tdnn1a_sp/: num-iters=342 nj=2..5 num-params=17.9M dim=40+100->3192 combine=-0.042->-0.041 (over 8) xent:train/valid[227,341,final]=(-0.451,-0.363,-0.346/-0.524,-0.466,-0.434) logprob:train/valid[227,341,final]=(-0.047,-0.043,-0.042/-0.058,-0.056,-0.054) + +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=10 +train_set=train +test_sets="test" +gmm=tri3 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 11 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + 
batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
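  # A hedged sketch of what this stage yields (assumed paths, not verified
  # output): mkgraph.sh composes the tgsmall grammar, lexicon and model
  # topology into an HCLG decoding graph, so afterwards one expects
  #   $tree_dir/graph_tgsmall/HCLG.fst   # decoding graph read by the decode stage below
  #   $tree_dir/graph_tgsmall/words.txt  # word symbol table used in scoring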
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l tmp + cut -f 3- < tmp > out + + tr '[:lower:]' '[:upper:]' < out > tmp + tr -d '[:cntrl:]' < tmp > out + sent=$( out + tr '[:lower:]' '[:upper:]' < tmp > out + trans=$(> $data/$target/utt2spk + echo "$uttID $KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -f wav -p -c 1 $utt|" >> $data/$target/wav.scp + echo "$spkID f" >> $data/$target/spk2gender + echo "$uttID $sent" >> $data/$target/text + fi + done + fi + fi +done + +for d in $data/train $data/test; do + utils/utt2spk_to_spk2utt.pl $d/utt2spk > $d/spk2utt + utils/fix_data_dir.sh $d +done + +printf "\t total: %s; train: %s; test: %s.\n" "$total_cnt" "$train_cnt" "$test_cnt" +rm -f out tmp + +# Optional +# Get data duration, just for book keeping +# for data in $data/train $data/test; do +# ./local/data_duration.sh $data +# done +# + diff --git a/egs/cmu_cslu_kids/s5/local/cslu_aud_prep.sh b/egs/cmu_cslu_kids/s5/local/cslu_aud_prep.sh new file mode 100755 index 00000000000..735f87eca9f --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/cslu_aud_prep.sh @@ -0,0 +1,43 @@ +#/bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Called by local/cslu_DataPrep.shi + +Assignment() +{ + rnd=$((1+RANDOM % 100)) + if [ $rnd -le $test_percentage ]; then + target="test" + else + target="train" + fi +} +audio= +test_percentage=30 # Percent of data reserved as test set +debug=debug/cslu_dataprep_debug +data=data/data_cslu +. ./utils/parse_options.sh + +uttID=$(basename $audio) +uttID=${uttID%'.wav'} +sentID=${uttID: -3} +spkID=${uttID%$sentID} +sentID=${sentID%"0"} +sentID=$(echo "$sentID" | tr '[:lower:]' '[:upper:]' ) + +line=$(grep $sentID cslu/docs/all.map) + +if [ -z "$line" ]; then # Can't map utterance to transcript + echo $audio $sentID >> $debug +else + txt=$(echo $line | grep -oP '"\K.*?(?=")') + cap_txt=${txt^^} + Assignment + echo "$uttID $cap_txt" >> $data/$target/text + echo "$uttID $spkID" >> $data/$target/utt2spk + echo "$spkID f" >> $data/$target/spk2gender + echo "$uttID $audio" >> $data/$target/wav.scp +fi + diff --git a/egs/cmu_cslu_kids/s5/local/cslu_prepare_data.sh b/egs/cmu_cslu_kids/s5/local/cslu_prepare_data.sh new file mode 100755 index 00000000000..621179079b3 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/cslu_prepare_data.sh @@ -0,0 +1,49 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Prepares cslu_kids +# Should be run from egs/cmu_csli_kids + +set -e +Looper() +{ + # echo "Looping through $1" + for f in $1/*; do + if [ -d $f ]; then + Looper $f + else + ./local/cslu_aud_prep.sh --data $data --audio $f + fi + done +} + +data=data/data_cslu +corpus=cslu +. ./utils/parse_options.sh + +rm -f debug/cslu_dataprep_debug +mkdir -p debug +# File check, remove previous data and features files +for d in $data/test $data/train; do + mkdir -p $d + ./local/file_check.sh $d +done + +echo "Preparing cslu_kids..." +Looper $corpus/speech/scripted + +for d in $data/test $data/train; do + ./utils/utt2spk_to_spk2utt.pl $d + ./utils/fix_data_dir.sh $d +done +if [ -f debug/cslu_dataprep_debug ]; then + echo "Missing transcripts for some utterances. 
See cslu_dataprep_debug" +fi + +# Optional +# Get data duration, just for book keeping +# for data in data/data_cslu/test data/data_cslu/train; do +# ./local/data_duration.sh $data +# done diff --git a/egs/cmu_cslu_kids/s5/local/data_duration.sh b/egs/cmu_cslu_kids/s5/local/data_duration.sh new file mode 100755 index 00000000000..e838e365ea7 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/data_duration.sh @@ -0,0 +1,19 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Get duration of the utterance given data dir +set -eu +echo $0 $@ + +data_dir=$1 +mkdir -p duration + +./utils/data/get_utt2dur.sh $data_dir + +echo "$data_dir" +python local/sum_duration.py $data_dir/utt2dur +echo "" + + diff --git a/egs/cmu_cslu_kids/s5/local/download_cmu_dict.sh b/egs/cmu_cslu_kids/s5/local/download_cmu_dict.sh new file mode 100755 index 00000000000..0248dd0cae1 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/download_cmu_dict.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2019 Fei Wu +set -eu +# Adapted from the local/prepare_dict script in +# the librispeech recipe. Download and prepare CMU_dict. +# For childresn speech ASR tasks, since the vocabulary in cmu_kids and +# cslu_kids is relatively easy comparing to librispeech, we use only the +# CMU_dict, and do not handle OOV with G2P. +# Should be run from egs/cmu_cslu_kids. +# Usage: +# local/download_cmu_dict.sh --dict_dir + +dict_dir=data/local/dict +OOV="" + +. ./utils/parse_options.sh || exit 1; +. ./path.sh || exit 1 + +if [ ! -d $dict_dir ]; then + echo "Downloading and preparing CMU dict" + svn co -r 12440 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dict_dir/raw_dict || exit 1; + + echo "Removing the pronunciation variant markers ..." + grep -v ';;;' $dict_dir/raw_dict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' | \ + sort -u > $dict_dir/lexicon.txt || exit 1; + + tr -d '\r' < $dict_dir/raw_dict/cmudict.0.7a.symbols > $dict_dir/nonsilence_phones.txt + + echo "$OOV SIL" >> $dict_dir/lexicon.txt + + echo "SIL" > $dict_dir/silence_phones.txt + echo "SPN" >> $dict_dir/silence_phones.txt + echo "SIL" > $dict_dir/optional_silence.txt + + rm -rf $dict_dir/raw_dict +fi diff --git a/egs/cmu_cslu_kids/s5/local/download_lm.sh b/egs/cmu_cslu_kids/s5/local/download_lm.sh new file mode 100755 index 00000000000..382f313df7c --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/download_lm.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +if [ $# -ne "2" ]; then + echo "Usage: $0 " + echo "e.g.: $0 http://www.openslr.org/resources/11 data/local/lm" + exit 1 +fi + +base_url=$1 +dst_dir=$2 + +# given a filename returns the corresponding file size in bytes +# The switch cases below can be autogenerated by entering the data directory and running: +# for f in *; do echo "\"$f\") echo \"$(du -b $f | awk '{print $1}')\";;"; done +function filesize() { + case $1 in + "3-gram.arpa.gz") echo "759636181";; + "3-gram.pruned.1e-7.arpa.gz") echo "34094057";; + "3-gram.pruned.3e-7.arpa.gz") echo "13654242";; + "4-gram.arpa.gz") echo "1355172078";; + "g2p-model-5") echo "20098243";; + "librispeech-lexicon.txt") echo "5627653";; + "librispeech-lm-corpus.tgz") echo "1803499244";; + "librispeech-lm-norm.txt.gz") echo "1507274412";; + "librispeech-vocab.txt") echo "1737588";; + *) echo "";; + esac +} + +function check_and_download () { + [[ $# -eq 1 ]] || { echo "check_and_download() expects exactly one argument!"; return 1; } + fname=$1 + echo "Downloading file '$fname' into 
'$dst_dir'..." + expect_size="$(filesize $fname)" + [[ ! -z "$expect_size" ]] || { echo "Unknown file size for '$fname'"; return 1; } + if [[ -s $dst_dir/$fname ]]; then + # In the following statement, the first version works on linux, and the part + # after '||' works on Linux. + f=$dst_dir/$fname + fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) + if [[ "$fsize" -eq "$expect_size" ]]; then + echo "'$fname' already exists and appears to be complete" + return 0 + else + echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..." + fi + fi + wget --no-check-certificate -O $dst_dir/$fname $base_url/$fname || { + echo "Error while trying to download $fname!" + return 1 + } + f=$dst_dir/$fname + # In the following statement, the first version works on linux, and the part after '||' + # works on Linux. + fsize=$(set -o pipefail; du -b $f 2>/dev/null | awk '{print $1}' || stat '-f %z' $f) + [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; } + return 0 +} + +mkdir -p $dst_dir + +for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz 4-gram.arpa.gz \ + g2p-model-5 librispeech-lm-corpus.tgz librispeech-vocab.txt librispeech-lexicon.txt; do + check_and_download $f || exit 1 +done + +cd $dst_dir +ln -sf 3-gram.pruned.1e-7.arpa.gz lm_tgmed.arpa.gz +ln -sf 3-gram.pruned.3e-7.arpa.gz lm_tgsmall.arpa.gz +ln -sf 3-gram.arpa.gz lm_tglarge.arpa.gz +ln -sf 4-gram.arpa.gz lm_fglarge.arpa.gz + +exit 0 diff --git a/egs/cmu_cslu_kids/s5/local/file_check.sh b/egs/cmu_cslu_kids/s5/local/file_check.sh new file mode 100755 index 00000000000..859f228058a --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/file_check.sh @@ -0,0 +1,17 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + + +printf "\t File Check in folder: %s.\n" "$1" + +WavScp="$1/wav.scp" +Text="$1/text" +Utt2Spk="$1/utt2spk" +Gend="$1/utt2gender" +Spk2Utt="$1/spk2utt" +rm -f $WavScp $Text $Utt2Spk $Gend $Spk2Utt + + + diff --git a/egs/cmu_cslu_kids/s5/local/format_lms.sh b/egs/cmu_cslu_kids/s5/local/format_lms.sh new file mode 100755 index 00000000000..b530f61d2d9 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/format_lms.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +# Copyright 2014 Vassil Panayotov +# Apache 2.0 + +# Prepares the test time language model(G) transducers +# (adapted from wsj/s5/local/wsj_format_data.sh) + +. ./path.sh || exit 1; + +# begin configuration section +src_dir=data/lang +# end configuration section + +. utils/parse_options.sh || exit 1; + +set -e + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a15/vpanayotov/data/lm" + echo ", where:" + echo " is the directory in which the language model is stored/downloaded" + echo "Options:" + echo " --src-dir
# source lang directory, default data/lang" + exit 1 +fi + +lm_dir=$1 + +if [ ! -d $lm_dir ]; then + echo "$0: expected source LM directory $lm_dir to exist" + exit 1; +fi +if [ ! -f $src_dir/words.txt ]; then + echo "$0: expected $src_dir/words.txt to exist." + exit 1; +fi + + +tmpdir=data/local/lm_tmp.$$ +trap "rm -r $tmpdir" EXIT + +mkdir -p $tmpdir + +for lm_suffix in tgsmall tgmed; do + # tglarge is prepared by a separate command, called from run.sh; we don't + # want to compile G.fst for tglarge, as it takes a while. + test=${src_dir}_test_${lm_suffix} + mkdir -p $test + cp -r ${src_dir}/* $test + gunzip -c $lm_dir/lm_${lm_suffix}.arpa.gz | \ + arpa2fst --disambig-symbol=#0 \ + --read-symbol-table=$test/words.txt - $test/G.fst + utils/validate_lang.pl --skip-determinization-check $test || exit 1; +done + +echo "Succeeded in formatting data." + +exit 0 diff --git a/egs/cmu_cslu_kids/s5/local/make_lm.pl b/egs/cmu_cslu_kids/s5/local/make_lm.pl new file mode 100755 index 00000000000..80eea5a6198 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/make_lm.pl @@ -0,0 +1,119 @@ +#!/usr/bin/env perl + +# Copyright 2010-2011 Yanmin Qian Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This file takes as input the file wp_gram.txt that comes with the RM +# distribution, and creates the language model as an acceptor in FST form. 
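# A hedged sketch of the output format (inferred from the print statements
# below, not a spec): each arc comes out in OpenFst text form,
#   src-state dst-state input-word output-word weight
# followed by a final-state line; weights are negative log probabilities,
# e.g. a state with four equally likely successors gets arcs of weight
# -log(1/4) ~= 1.386. Input and output labels coincide because G is an
# acceptor.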
+ +# make_rm_lm.pl wp_gram.txt > G.txt + +if (@ARGV != 1) { + print "usage: make_rm_lm.pl wp_gram.txt > G.txt\n"; + exit(0); +} +unless (open(IN_FILE, "@ARGV[0]")) { + die ("can't open @ARGV[0]"); +} + + +$flag = 0; +$count_wrd = 0; +$cnt_ends = 0; +$init = ""; + +while ($line = ) +{ + chop($line); # Return the last char + + $line =~ s/ //g; # Selete all spaces + + if(($line =~ /^>/)) # If line has ">" + { + if($flag == 0) # Flip flag + { + $flag = 1; + } + $line =~ s/>//g; # Delete ">" + $hashcnt{$init} = $i; + $init = $line; + $i = 0; + $count_wrd++; + @LineArray[$count_wrd - 1] = $init; + $hashwrd{$init} = 0; + } + elsif($flag != 0) + { + + $hash{$init}[$i] = $line; + $i++; + if($line =~ /SENTENCE-END/) + { + $cnt_ends++; + } + } + else + {} +} + +$hashcnt{$init} = $i; + +$num = 0; +$weight = 0; +$init_wrd = "SENTENCE-END"; +$hashwrd{$init_wrd} = @LineArray; +for($i = 0; $i < $hashcnt{$init_wrd}; $i++) +{ + $weight = -log(1/$hashcnt{$init_wrd}); + $hashwrd{$hash{$init_wrd}[$i]} = $i + 1; + print "0 $hashwrd{$hash{$init_wrd}[$i]} $hash{$init_wrd}[$i] $hash{$init_wrd}[$i] $weight\n"; +} +$num = $i; + +for($i = 0; $i < @LineArray; $i++) +{ + if(@LineArray[$i] eq 'SENTENCE-END') + {} + else + { + if($hashwrd{@LineArray[$i]} == 0) + { + $num++; + $hashwrd{@LineArray[$i]} = $num; + } + for($j = 0; $j < $hashcnt{@LineArray[$i]}; $j++) + { + $weight = -log(1/$hashcnt{@LineArray[$i]}); + if($hashwrd{$hash{@LineArray[$i]}[$j]} == 0) + { + $num++; + $hashwrd{$hash{@LineArray[$i]}[$j]} = $num; + } + if($hash{@LineArray[$i]}[$j] eq 'SENTENCE-END') + { + print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $weight\n" + } + else + { + print "$hashwrd{@LineArray[$i]} $hashwrd{$hash{@LineArray[$i]}[$j]} $hash{@LineArray[$i]}[$j] $hash{@LineArray[$i]}[$j] $weight\n"; + } + } + } +} + +print "$hashwrd{$init_wrd} 0\n"; +close(IN_FILE); + + diff --git a/egs/cmu_cslu_kids/s5/local/nnet3/compare_wer.sh b/egs/cmu_cslu_kids/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..095e85cc338 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/cmu_cslu_kids/s5/local/nnet3/run_ivector_common.sh b/egs/cmu_cslu_kids/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..c695f2c9f1c --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="test" +gmm=tri3b + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l 2041 combine=-0.47->-0.38 loglike:train/valid[20,31,combined]=(-0.62,-0.38,-0.37/-1.03,-1.03,-1.02) accuracy:train/valid[20,31,combined]=(0.79,0.87,0.87/0.70,0.72,0.72) + +# Below, comparing with the chain TDNN system. It's a little better with the +# small-vocab decoding. Both systems are probably super-badly tuned, and the +# chain system probably used too many jobs. +# +# local/nnet3/compare_wer.sh exp/chain/tdnn1a_sp exp/nnet3/tdnn_lstm1a_sp +# System tdnn1a_sp tdnn_lstm1a_sp +#WER dev_clean_2 (tgsmall) 18.43 17.37 +#WER dev_clean_2 (tglarge) 13.15 13.43 +# Final train prob -0.3933 +# Final valid prob -0.9662 +# Final train acc 0.8652 +# Final valid acc 0.7206 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. 
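# A minimal usage sketch (assumed, not from the patch): utils/parse_options.sh
# maps a command-line option such as --decode-nj 8 onto the shell variable
# decode_nj, so any variable in this config section can be overridden when the
# script is invoked, e.g. (placeholder path and values):
#   local/nnet3/tuning/run_tdnn_lstm_1a.sh --stage 12 --decode-nj 8 \
#     --train-set train_clean_5 --gmm tri3b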
+affix=1a # affix for the TDNN directory name +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=$label_delay input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2041 combine=-0.71->-0.58 loglike:train/valid[20,31,combined]=(-2.78,-0.95,-0.57/-2.94,-1.31,-0.98) accuracy:train/valid[20,31,combined]=(0.48,0.75,0.81/0.45,0.67,0.71) + +# local/nnet3/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/nnet3/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +#WER dev_clean_2 (tgsmall) 17.67 17.01 +# [online:] 18.06 17.26 +#WER dev_clean_2 (tglarge) 13.43 12.63 +# [online:] 13.73 12.94 +# Final train prob -0.3660 -0.5680 +# Final valid prob -1.0236 -0.9771 +# Final train acc 0.8737 0.8067 +# Final valid acc 0.7222 0.7144 + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN+LSTM directory name +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=$label_delay input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-renorm-layer name=tdnn1 dim=520 + relu-renorm-layer name=tdnn2 dim=520 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-renorm-layer name=tdnn3 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-renorm-layer name=tdnn5 dim=520 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=520 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value=0.99 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2041 combine=-0.99->-0.81 loglike:train/valid[20,31,combined]=(-1.22,-0.69,-0.61/-1.34,-1.02,-0.91) accuracy:train/valid[20,31,combined]=(0.68,0.779,0.800/0.64,0.70,0.724) + + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1c # affix for the TDNN+LSTM directory name +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=$label_delay input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn1 dim=520 $tdnn_opts + relu-batchnorm-layer name=tdnn2 dim=520 $tdnn_opts input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-batchnorm-layer name=tdnn3 dim=520 $tdnn_opts input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=520 $tdnn_opts input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + relu-batchnorm-layer name=tdnn5 dim=520 $tdnn_opts input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=520 $tdnn_opts input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=520 recurrent-projection-dim=130 non-recurrent-projection-dim=130 $lstm_opts + + output-layer name=output input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=20000 \ + --trainer.optimization.num-jobs-initial=1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=$lang \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l data/lang/G.fst || exit 1; + +# Checking that G is stochastic [note, it wouldn't be for an Arpa] +fstisstochastic data/lang/G.fst || echo Error: G is not stochastic + +# Checking that G.fst is determinizable. +fstdeterminize data/lang/G.fst /dev/null || echo Error determinizing G. + +# Checking that L_disambig.fst is determinizable. +fstdeterminize data/lang/L_disambig.fst /dev/null || echo Error determinizing L. + +# Checking that disambiguated lexicon times G is determinizable +fsttablecompose data/lang/L_disambig.fst data/lang/G.fst | \ + fstdeterminize >/dev/null || echo Error + +# Checking that LG is stochastic: +fsttablecompose data/lang/L.fst data/lang/G.fst | \ + fstisstochastic || echo Error: LG is not stochastic. + +# Checking that L_disambig.G is stochastic: +fsttablecompose data/lang/L_disambig.fst data/lang/G.fst | \ + fstisstochastic || echo Error: LG is not stochastic. + +echo "Succeeded preparing grammar for CMU_kids." diff --git a/egs/cmu_cslu_kids/s5/local/score.sh b/egs/cmu_cslu_kids/s5/local/score.sh new file mode 100755 index 00000000000..c812199fc98 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/score.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) +# 2014 Guoguo Chen +# Apache 2.0 + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." 
+ echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt + +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab \ + ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; +done + +# Note: the double level of quoting for the sed command +for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ + cat $dir/scoring/LMWT.$wip.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; +done + +exit 0; diff --git a/egs/cmu_cslu_kids/s5/local/sort_result.sh b/egs/cmu_cslu_kids/s5/local/sort_result.sh new file mode 100755 index 00000000000..aedec9dc344 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/sort_result.sh @@ -0,0 +1,46 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Sorts and reports results in results/results.txt +# for all models in exp. Expects decode directories +# to be named as exp//decode* or exp/chain/tdnn*/decode* +# Should be run from egs/cmu_cslu_kids. + +res=${1:-"results/results.txt"} +exp=exp +mkdir -p results +rm -f $res + +echo "Sorting results in: " +echo "# ---------- GMM-HMM Models ----------" >> $res +for mdl in $exp/mono* $exp/tri*; do + echo " $mdl" + if [ -d $mdl ];then + for dec in $mdl/decode*;do + echo " $dec" + if [ -d $dec ];then + grep WER $dec/wer* | \ + sort -k2 -n > $dec/WERs + head -n 1 $dec/WERs >> $res + fi + done + fi +done + +echo "# ---------- DNN-HMM Models ----------" >> $res +# DNN results +for mdl in $exp/chain/tdnn*; do + echo " $mdl" + for dec in $mdl/decode*; do + if [ -d $dec ]; then + echo " $dec" + grep WER $dec/wer* | \ + sort -k2 -n > $dec/WERs + head -n 1 $dec/WERs >> $res + fi + done +done + +sed -i "s/:/ /g" $res diff --git a/egs/cmu_cslu_kids/s5/local/subset_dataset.sh b/egs/cmu_cslu_kids/s5/local/subset_dataset.sh new file mode 100755 index 00000000000..050128247a4 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/subset_dataset.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Luminar Technologies, Inc. (author: Daniel Galvez) +# Apache 2.0 + +# The following commands were used to generate the mini_librispeech dataset: +# +# Note that data generation is random. This could be fixed by +# providing a seed argument to the shuf program. + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 /export/a05/dgalvez/LibriSpeech/train-clean-100 \\ + /export/a05/dgalvez/LibriSpeech/train-clean-5 5" + exit 1 +fi + +src_dir=$1 +dest_dir=$2 +dest_num_hours=$3 + +src=$(basename $src_dir) +dest=$(basename $dest_dir) +librispeech_dir=$(dirname $src_dir) + +# TODO: Possibly improve this to ensure gender balance and speaker +# balance. 
+# TODO: Use actual time values instead of assuming that to make sure we get $dest_num_hours of data +src_num_hours=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | awk -F'|' '{ print $3 }' | \ +python -c ' +from __future__ import print_function +from sys import stdin +minutes_str = stdin.read().split() +print(int(round(sum([float(minutes) for minutes in minutes_str]) / 60.0)))') +src_num_chapters=$(grep "$src" $librispeech_dir/CHAPTERS.TXT | \ + awk -F'|' '{ print $1 }' | sort -u | wc -l) +mkdir -p data/subset_tmp +grep "$src" $librispeech_dir/CHAPTERS.TXT | \ + awk -F'|' '{ print $1 }' | \ + shuf -n $(((dest_num_hours * src_num_chapters) / src_num_hours)) > \ + data/subset_tmp/${dest}_chapter_id_list.txt + +while read -r chapter_id || [[ -n "$chapter_id" ]]; do + chapter_dir=$(find $src_dir/ -mindepth 2 -name "$chapter_id" -type d) + speaker_id=$(basename $(dirname $chapter_dir)) + mkdir -p $dest_dir/$speaker_id/ + cp -r $chapter_dir $dest_dir/$speaker_id/ +done < data/subset_tmp/${dest}_chapter_id_list.txt diff --git a/egs/cmu_cslu_kids/s5/local/sum_duration.py b/egs/cmu_cslu_kids/s5/local/sum_duration.py new file mode 100644 index 00000000000..0af7ba62151 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/sum_duration.py @@ -0,0 +1,15 @@ +# Sum duration obtained by using +# utils/data/get_utt2dur.sh + +import sys +file = sys.argv[1] +sum = 0 +with open(file, 'r') as fp: + line = fp.readline() + while(line): + toks = line.strip().split() + sum += float(toks[1]) + line = fp.readline() +fp.close() +h=sum/3600 +sys.stdout.write("%f hour data.\n"%h) diff --git a/egs/cmu_cslu_kids/s5/local/train_lms.sh b/egs/cmu_cslu_kids/s5/local/train_lms.sh new file mode 100755 index 00000000000..0807210be18 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/train_lms.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# This script trains LMs on the WSJ LM-training data. +# It requires that you have already run wsj_extend_dict.sh, +# to get the larger-size dictionary including all of CMUdict +# plus any OOVs and possible acronyms that we could easily +# derive pronunciations for. + +dict_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +srcdir=data/local/dict${dict_suffix}_larger +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/kaldi_lm:$PATH +( # First make sure the kaldi_lm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d kaldi_lm ]; then + echo Not installing the kaldi_lm toolkit since it is already there. + else + echo Downloading and installing the kaldi_lm tools + if [ ! -f kaldi_lm.tar.gz ]; then + wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; + fi + tar -xvzf kaldi_lm.tar.gz || exit 1; + cd kaldi_lm + make || exit 1; + echo Done making the kaldi_lm tools + fi +) || exit 1; + + + +if [ ! -f $srcdir/cleaned.gz -o ! -f $srcdir/lexicon.txt ]; then + echo "Expecting files $srcdir/cleaned.gz and $srcdir/lexicon.txt to exist"; + echo "You need to run local/wsj_extend_dict.sh before running this script." + exit 1; +fi + +# Get a wordlist-- keep everything but silence, which should not appear in +# the LM. +awk '{print $1}' $srcdir/lexicon.txt | grep -v -w '!SIL' > $dir/wordlist.txt + +# Get training data with OOV words (w.r.t. our current vocab) replaced with . 
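# A hedged illustration of the awk filter below (example words are made up):
# with HELLO and WORLD present in wordlist.txt, an input line
#   HELLO ZWQX WORLD
# is emitted with the out-of-vocabulary ZWQX rewritten to the OOV symbol,
# while in-vocabulary words pass through unchanged.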
+echo "Getting training data with OOV words replaced with (train_nounk.gz)" +gunzip -c $srcdir/cleaned.gz | awk -v w=$dir/wordlist.txt \ + 'BEGIN{while((getline0) v[$1]=1;} + {for (i=1;i<=NF;i++) if ($i in v) printf $i" ";else printf " ";print ""}'|sed 's/ $//g' \ + | gzip -c > $dir/train_nounk.gz + +# Get unigram counts (without bos/eos, but this doens't matter here, it's +# only to get the word-map, which treats them specially & doesn't need their +# counts). +# Add a 1-count for each word in word-list by including that in the data, +# so all words appear. +gunzip -c $dir/train_nounk.gz | cat - $dir/wordlist.txt | \ + awk '{ for(x=1;x<=NF;x++) count[$x]++; } END{for(w in count){print count[w], w;}}' | \ + sort -nr > $dir/unigram.counts + +# Get "mapped" words-- a character encoding of the words that makes the common words very short. +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map + +gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=1;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz + +# To save disk space, remove the un-mapped training data. We could +# easily generate it again if needed. +rm $dir/train_nounk.gz + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + + +exit 0 + +### Below here, this script is showing various commands that +## were run during LM tuning. + +train_lm.sh --arpa --lmtype 3gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826 +# 7.8 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/3gram-mincount/ +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 156.408740 +# 2.5 million N-grams. + +prune_lm.sh --arpa 6.0 $dir/3gram-mincount/ +# 1.45 million N-grams. +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139 + +train_lm.sh --arpa --lmtype 4gram-mincount $dir +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180 +# 10.3 million N-grams. + +prune_lm.sh --arpa 3.0 $dir/4gram-mincount +#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 143.206294 +# 2.6 million N-grams. + +prune_lm.sh --arpa 4.0 $dir/4gram-mincount +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 146.927717 +# 2.15 million N-grams. + +prune_lm.sh --arpa 5.0 $dir/4gram-mincount +# 1.86 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 150.162023 + +prune_lm.sh --arpa 7.0 $dir/4gram-mincount +# 1.50 million N-grams +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 155.663757 + +train_lm.sh --arpa --lmtype 3gram $dir +# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 135.692866 +# 20.0 million N-grams + +! which ngram-count \ + && echo "SRILM tools not installed so not doing the comparison" && exit 1; + +################# +# You could finish the script here if you wanted. +# Below is to show how to do baselines with SRILM. 
+# You'd have to install the SRILM toolkit first. + +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. +mkdir -p $sdir +gunzip -c $srcdir/cleaned.gz | head -$heldout_sent > $sdir/cleaned.heldout +gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent > $sdir/cleaned.train +(echo ""; echo "" ) | cat - $dir/wordlist.txt > $sdir/wordlist.final.s + +# 3-gram: +ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/cleaned.heldout # consider -debug 2 +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -491456 ppl= 141.457 ppl1= 177.437 + +# Trying 4-gram: +ngram-count -text $sdir/cleaned.train -order 4 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o4g.kn.gz +ngram -order 4 -lm $sdir/srilm.o4g.kn.gz -ppl $sdir/cleaned.heldout +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -480939 ppl= 127.233 ppl1= 158.822 + +#3-gram with pruning: +ngram-count -text $sdir/cleaned.train -order 3 -limit-vocab -vocab $sdir/wordlist.final.s -unk \ + -prune 0.0000001 -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.pr7.kn.gz +ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 478 OOVs +#0 zeroprobs, logprob= -510828 ppl= 171.947 ppl1= 217.616 +# Around 2.25M N-grams. +# Note: this is closest to the experiment done with "prune_lm.sh --arpa 3.0 $dir/3gram-mincount/" +# above, which gave 2.5 million N-grams and a perplexity of 156. + +# Note: all SRILM experiments above fully discount all singleton 3 and 4-grams. +# You can use -gt3min=0 and -gt4min=0 to stop this (this will be comparable to +# the kaldi_lm experiments above without "-mincount". + +## From here is how to train with +# IRSTLM. This is not really working at the moment. + +if [ -z $IRSTLM ] ; then + export IRSTLM=$KALDI_ROOT/tools/irstlm/ +fi +export PATH=${PATH}:$IRSTLM/bin +if ! command -v prune-lm >/dev/null 2>&1 ; then + echo "$0: Error: the IRSTLM is not available or compiled" >&2 + echo "$0: Error: We used to install it by default, but." >&2 + echo "$0: Error: this is no longer the case." 
>&2 + echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2 + echo "$0: Error: and run extras/install_irstlm.sh" >&2 + exit 1 +fi + +idir=$dir/irstlm +mkdir $idir +gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | add-start-end.sh | \ + gzip -c > $idir/train.gz + +dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no + cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\ +{print $0;}}' > vocab.irstlm.20k + + +build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \ + -n 3 -s improved-kneser-ney -b yes +# Testing perplexity with SRILM tools: +ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout +#data/local/local_lm/irstlm/lm_3gram.gz: line 162049: warning: non-zero probability for in closed-vocabulary LM +#file data/local/local_lm/srilm/cleaned.heldout: 10000 sentences, 218996 words, 0 OOVs +#0 zeroprobs, logprob= -513670 ppl= 175.041 ppl1= 221.599 + +# Perplexity is very bad (should be ~141, since we used -p option, +# not 175), +# but adding -debug 3 to the command line shows that +# the IRSTLM LM does not seem to sum to one properly, so it seems that +# it produces an LM that isn't interpretable in the normal way as an ARPA +# LM. + + + diff --git a/egs/cmu_cslu_kids/s5/local/vtln.sh b/egs/cmu_cslu_kids/s5/local/vtln.sh new file mode 100755 index 00000000000..0ca179ce89f --- /dev/null +++ b/egs/cmu_cslu_kids/s5/local/vtln.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +# Run VTLN. This will be run if the vtln option +# is set to be true in run.sh. + +set -eu +stage=0 +featdir=mfcc/vtln +data=data +mdl=exp/tri3 +mdl_vtln=${mdl}_vtln +vtln_lda=exp/tri4 +vtln_sat=exp/tri5 + +. ./cmd.sh +. ./utils/parse_options.sh + +mkdir -p $featdir + +steps/train_lvtln.sh --cmd "$train_cmd" 1800 9000 $data/train $data/lang $mdl $mdl_vtln + +if [ $stage -le 0 ]; then + mkdir -p $data/train_vtln + cp $data/train/* $data/train_vtln || true + cp $mdl_vtln/final.warp $data/train_vtln/spk2warp + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" $data/train_vtln exp/make_mfcc/train_vtln $featdir + steps/compute_cmvn_stats.sh $data/train_vtln exp/make_mfcc/train_vtln $featdir +fi + +if [ $stage -le 1 ]; then + utils/mkgraph.sh $data/lang_test_tgmed $mdl_vtln $mdl_vtln/graph + steps/decode_lvtln.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ + $mdl_vtln/graph $data/test $mdl_vtln/decode +fi + +if [ $stage -le 2 ]; then + mkdir -p $data/test_vtln + cp $data/test/* $data/test_vtln || true + cp $mdl_vtln/decode/final.warp $data/test_vtln/spk2warp + steps/make_mfcc.sh --nj 8 --cmd "$train_cmd" $data/test_vtln exp/make_mfcc/test_vtln $featdir + steps/compute_cmvn_stats.sh $data/test_vtln exp/make_mfcc/test_vtln $featdir +fi + +if [ $stage -le 3 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 1800 9000 \ + $data/train_vtln $data/lang $mdl_vtln $vtln_lda + utils/mkgraph.sh $data/lang_test_tgmed $vtln_lda $vtln_lda/graph + echo "$mdl_vtln + lda + mllt" > $vtln_lda/mcodel_discription + steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \ + $vtln_lda/graph $data/test_vtln $vtln_lda/decode +fi + +if [ $stage -le 4 ]; then + steps/train_sat.sh 1800 9000 $data/train_vtln $data/lang $vtln_lda $vtln_sat + utils/mkgraph.sh $data/lang_test_tgmed $vtln_sat $vtln_sat/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" $vtln_sat/graph $data/test_vtln $vtln_sat/decode + echo 
"$mdl_vtln + lda + mllt + SAT" > $vtln_sat/model_discription +fi diff --git a/egs/cmu_cslu_kids/s5/path.sh b/egs/cmu_cslu_kids/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/cmu_cslu_kids/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/cmu_cslu_kids/s5/run.sh b/egs/cmu_cslu_kids/s5/run.sh new file mode 100755 index 00000000000..43ae1ea9426 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/run.sh @@ -0,0 +1,177 @@ +#! /bin/bash + +# Copyright Johns Hopkins University +# 2019 Fei Wu + +set -eo + +stage=0 +cmu_kids= # path to cmu_kids corpus +cslu_kids= # path to cslu_kids corpus +lm_src= # path of existing librispeech lm +extra_features=false # Extra features for GMM model (MMI, boosting and MPE) +vtln=false # Optional, run VLTN on gmm and tdnnf models if set true +email= # Reporting email for tdnn-f training + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +lm_url=www.openslr.org/resources/11 +mkdir -p data +mkdir -p data/local + +# Prepare data +if [ $stage -le 0 ]; then + # Make soft link to the corpora + if [ ! -e cmu_kids ]; then + ln -sf $cmu_kids cmu_kids + fi + if [ ! -e cslu ]; then + ln -sf $cslu_kids cslu + fi + + # Make softlink to lm, if lm_src provided + if [ ! -z "$lm_src" ] && [ ! -e data/local/lm ] ; then + ln -sf $lm_src data/local/lm + fi + + # Remove old data dirs + rm -rf data/data_cmu + rm -rf data/data_cslu + + # Data Prep + ./local/cmu_prepare_data.sh --corpus cmu_kids/kids --data data/data_cmu + ./local/cslu_prepare_data.sh --corpus cslu --data data/data_cslu +fi + +# Combine data +if [ $stage -le 1 ]; then + mkdir -p data/train + mkdir -p data/test + rm -rf data/train/* + rm -rf data/test/* + ./utils/combine_data.sh data/train data/data_cmu/train data/data_cslu/train + ./utils/combine_data.sh data/test data/data_cmu/test data/data_cslu/test +fi + +# LM, WFST Preparation +if [ $stage -le 2 ]; then + if [ ! -d data/local/dict ]; then + ./local/download_cmu_dict.sh + fi + + if [ ! -e data/local/lm ]; then + echo "lm_src not provided. Downloading lm from openslr." 
+ ./local/download_lm.sh $lm_url data/local/lm + fi + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + local/format_lms.sh --src_dir data/lang data/local/lm + + # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs + utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge + utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge +fi + +# Make MFCC features +if [ $stage -le 3 ]; then + mkdir -p mfcc + mkdir -p exp + steps/make_mfcc.sh --nj 40 --cmd "$train_cmd" data/test exp/make_feat/test mfcc + steps/compute_cmvn_stats.sh data/test exp/make_feat/test mfcc + steps/make_mfcc.sh --nj 40 --cmd "$train_cmd" data/train exp/make_feat/train mfcc + steps/compute_cmvn_stats.sh data/train exp/make_feat/train mfcc +fi + +# Mono-phone +if [ $stage -le 4 ]; then + # Train + steps/train_mono.sh --nj 40 --cmd "$train_cmd" data/train data/lang exp/mono + #Decode + utils/mkgraph.sh data/lang_test_tgsmall exp/mono exp/mono/graph + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/mono/graph data/test exp/mono/decode + #Align + steps/align_si.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_ali +fi + +# Tri1 [Vanilla tri phone model] +if [ $stage -le 5 ]; then + # Train + steps/train_deltas.sh --cmd "$train_cmd" 1800 9000 data/train data/lang exp/mono_ali exp/tri1 + # Decode + utils/mkgraph.sh data/lang_test_tgmed exp/tri1 exp/tri1/graph + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1/graph data/test exp/tri1/decode + # Align - make graph - decode again + steps/align_si.sh --nj 20 --cmd "queue.pl" --use-graphs true data/train data/lang_test_tgmed exp/tri1 exp/tri1_ali + utils/mkgraph.sh data/lang_test_tgmed exp/tri1_ali exp/tri1_ali/graph + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri1_ali/graph data/test exp/tri1_ali/decode +fi + +# Add LDA and MLLT +if [ $stage -le 6 ]; then + # Train + steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" 1800 9000 data/train data/lang exp/tri1_ali exp/tri2 + utils/mkgraph.sh data/lang_test_tgmed exp/tri2 exp/tri2/graph + # Decode + steps/decode.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2/decode + # Align - make graph - dcode again + steps/align_si.sh --nj 20 --cmd "$train_cmd" --use-graphs true data/train data/lang_test_tgmed exp/tri2 exp/tri2_ali + utils/mkgraph.sh data/lang_test_tgmed exp/tri2_ali exp/tri2_ali/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri2_ali/graph data/test exp/tri2_ali/decode +fi + +# Add other features +if [ $stage -le 7 ]; then + if [ $extra_features = true ]; then + # Add MMI + steps/make_denlats.sh --nj 20 --cmd "$train_cmd" data/train data/lang exp/tri2 exp/tri2_denlats + steps/train_mmi.sh data/train data/lang exp/tri2_ali exp/tri2_denlats exp/tri2_mmi + steps/decode.sh --config conf/decode.config --iter 4 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi/decode_it4 + steps/decode.sh --config conf/decode.config --iter 3 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi/decode_it3 + + # Add Boosting + steps/train_mmi.sh --boost 0.05 data/train data/lang exp/tri2_ali exp/tri2_denlats exp/tri2_mmi_b0.05 + steps/decode.sh --config conf/decode.config --iter 4 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi_b0.05/decode_it4 + 
steps/decode.sh --config conf/decode.config --iter 3 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mmi_b0.05/decode_it3 + + # Add MPE + steps/train_mpe.sh data/train data/lang exp/tri2_ali exp/tri2_denlats exp/tri2_mpe + steps/decode.sh --config conf/decode.config --iter 4 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mpe/decode_it4 + steps/decode.sh --config conf/decode.config --iter 3 --nj 20 --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2_mpe/decode_it3 + fi +fi + +# Add SAT +if [ $stage -le 8 ]; then + # Do LDA+MLLT+SAT, and decode. + steps/train_sat.sh 1800 9000 data/train data/lang exp/tri2_ali exp/tri3 + utils/mkgraph.sh data/lang_test_tgmed exp/tri3 exp/tri3/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri3/graph data/test exp/tri3/decode +fi + +if [ $stage -le 9 ]; then + # Align all data with LDA+MLLT+SAT system (tri3) + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" --use-graphs true data/train data/lang_test_tgmed exp/tri3 exp/tri3_ali + utils/mkgraph.sh data/lang_test_tgmed exp/tri3_ali exp/tri3_ali/graph + steps/decode_fmllr.sh --config conf/decode.config --nj 40 --cmd "$decode_cmd" exp/tri3_ali/graph data/test exp/tri3_ali/decode +fi + +if [ $stage -le 10 ]; then + # Uncomment reporting email option to get training progress updates by email + ./local/chain/run_tdnnf.sh --train_set train \ + --test_sets test --gmm tri3 # --reporting_email $email +fi + + +# Optional VTLN. Run if vtln is set to true +if [ $stage -le 11 ]; then + if [ $vtln = true ]; then + ./local/vtln.sh + ./local/chain/run_tdnnf.sh --nnet3_affix vtln --train_set train_vtln \ + --test_sets test_vtln --gmm tri5 # --reporting_email $email + fi +fi + +# Collect and resport WER results for all models +./local/sort_result.sh diff --git a/egs/cmu_cslu_kids/s5/steps b/egs/cmu_cslu_kids/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/cmu_cslu_kids/s5/utils b/egs/cmu_cslu_kids/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/cmu_cslu_kids/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/cnceleb/README.txt b/egs/cnceleb/README.txt new file mode 100644 index 00000000000..db8789839a9 --- /dev/null +++ b/egs/cnceleb/README.txt @@ -0,0 +1,9 @@ + +This directory contains example scripts for CN-Celeb speaker +verification. The CN-Celeb corpus is required, and can be +downloaded from Openslr http://www.openslr.org/82/ or from +CSLT@Tsinghua http://cslt.riit.tsinghua.edu.cn/~data/CN-Celeb/ + +The subdirectories "v1" and so on are different speaker recognition +recipes. The recipe in v1 demonstrates a standard approach using a +full-covariance GMM-UBM, iVectors, and a PLDA backend. diff --git a/egs/cnceleb/v1/README.txt b/egs/cnceleb/v1/README.txt new file mode 100644 index 00000000000..dc5086f0b7a --- /dev/null +++ b/egs/cnceleb/v1/README.txt @@ -0,0 +1,4 @@ + + This example demonstrates a traditional iVector system based on + CN-Celeb dataset. + diff --git a/egs/cnceleb/v1/cmd.sh b/egs/cnceleb/v1/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/cnceleb/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/cnceleb/v1/conf/mfcc.conf b/egs/cnceleb/v1/conf/mfcc.conf new file mode 100644 index 00000000000..649cffb9de8 --- /dev/null +++ b/egs/cnceleb/v1/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). +--num-mel-bins=30 +--num-ceps=24 +--snip-edges=false diff --git a/egs/cnceleb/v1/conf/vad.conf b/egs/cnceleb/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/cnceleb/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/cnceleb/v1/local/make_cnceleb.sh b/egs/cnceleb/v1/local/make_cnceleb.sh new file mode 100755 index 00000000000..14d44d6d3d0 --- /dev/null +++ b/egs/cnceleb/v1/local/make_cnceleb.sh @@ -0,0 +1,63 @@ +#!/bin/bash +# Copyright 2017 Ignacio Viñals +# 2017-2018 David Snyder +# 2019 Jiawen Kang +# +# This script prepares the CN-Celeb dataset. It creates separate directories +# for train, eval enroll and eval test. It also prepares a trials files, in the eval test directory. 
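+# The resulting layout is $out_dir/train, $out_dir/eval_enroll and
+# $out_dir/eval_test, each containing wav.scp and utt2spk, plus the trials
+# list written to $out_dir/eval_test/trials/trials.lst.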
+ +if [ $# != 2 ]; then + echo "Usage: make_cnceleb.sh " + echo "E.g.: make_cnceleb.sh /export/corpora/CN-Celeb data" + exit 1 +fi + +in_dir=$1 +out_dir=$2 + +# Prepare the development data +this_out_dir=${out_dir}/train +mkdir -p $this_out_dir 2>/dev/null +WAVFILE=$this_out_dir/wav.scp +SPKFILE=$this_out_dir/utt2spk +rm $WAVFILE $SPKFILE 2>/dev/null +this_in_dir=${in_dir}/dev + +for spkr_id in `cat $this_in_dir/dev.lst`; do + for f in $in_dir/data/$spkr_id/*.wav; do + wav_id=$(basename $f | sed s:.wav$::) + echo "${spkr_id}-${wav_id} $f" >> $WAVFILE + echo "${spkr_id}-${wav_id} ${spkr_id}" >> $SPKFILE + done +done +utils/fix_data_dir.sh $this_out_dir + +# Prepare the evaluation data +for mode in enroll test; do + this_out_dir=${out_dir}/eval_${mode} + mkdir -p $this_out_dir 2>/dev/null + WAVFILE=$this_out_dir/wav.scp + SPKFILE=$this_out_dir/utt2spk + rm $WAVFILE $SPKFILE 2>/dev/null + this_in_dir=${in_dir}/eval/${mode} + + for f in $this_in_dir/*.wav; do + wav_id=$(basename $f | sed s:.wav$::) + spkr_id=$(echo ${wav_id} | cut -d "-" -f1) + echo "${wav_id} $f" >> $WAVFILE + echo "${wav_id} ${spkr_id}" >> $SPKFILE + done + utils/fix_data_dir.sh $this_out_dir +done + +# Prepare test trials +this_out_dir=$out_dir/eval_test/trials +mkdir -p $out_dir/eval_test/trials +this_in_dir=${in_dir}/eval/lists +cat $this_in_dir/trials.lst | sed 's@-enroll@@g' | sed 's@test/@@g' | sed 's@.wav@@g' | \ + awk '{if ($3 == "1") + {print $1,$2,"target"} + else + {print $1,$2,"nontarget"} + }'> $this_out_dir/trials.lst + diff --git a/egs/cnceleb/v1/path.sh b/egs/cnceleb/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/cnceleb/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/cnceleb/v1/run.sh b/egs/cnceleb/v1/run.sh new file mode 100755 index 00000000000..0ca7ed8f277 --- /dev/null +++ b/egs/cnceleb/v1/run.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# 2019 Tsinghua University (Author: Jiawen Kang and Lantian Li) +# Apache 2.0. +# +# This is an i-vector-based recipe for CN-Celeb database. +# See ../README.txt for more info on data required. The recipe uses +# CN-Celeb/dev for training the UBM, T matrix and PLDA, and CN-Celeb/eval +# for evaluation. The results are reported in terms of EER and minDCF, +# and are inline in the comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +cnceleb_root=/export/corpora/CN-Celeb +eval_trails_core=data/eval_test/trials/trials.lst + +stage=0 + +if [ $stage -le 0 ]; then + # Prepare the CN-Celeb dataset. The script is used to prepare the development + # dataset and evaluation dataset. 
+ local/make_cnceleb.sh $cnceleb_root data +fi + +if [ $stage -le 1 ]; then + # Make MFCCs and compute the energy-based VAD for each dataset + for name in train eval_enroll eval_test; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 20 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + +if [ $stage -le 2 ]; then + # Train the UBM + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 --num-threads 8 \ + data/train 2048 \ + exp/diag_ubm + + sid/train_full_ubm.sh --cmd "$train_cmd --mem 16G" \ + --nj 20 --remove-low-count-gaussians false \ + data/train \ + exp/diag_ubm exp/full_ubm +fi + +if [ $stage -le 3 ]; then + # Train the i-vector extractor. + sid/train_ivector_extractor.sh --nj 20 --cmd "$train_cmd --mem 16G" \ + --ivector-dim 400 --num-iters 5 \ + exp/full_ubm/final.ubm data/train \ + exp/extractor +fi + +if [ $stage -le 4 ]; then + # Note that there are over one-third of the utterances less than 2 seconds in our training set, + # and these short utterances are harmful for PLDA training. Therefore, to improve performance + # of PLDA modeling and inference, we will combine the short utterances longer than 5 seconds. + utils/data/combine_short_segments.sh --speaker-only true \ + data/train 5 data/train_comb + # Compute the energy-based VAD for train_comb + sid/compute_vad_decision.sh --nj 20 --cmd "$train_cmd" \ + data/train_comb exp/make_vad $vaddir + utils/fix_data_dir.sh data/train_comb +fi + +if [ $stage -le 5 ]; then + # These i-vectors will be used for mean-subtraction, LDA, and PLDA training. + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + exp/extractor data/train_comb \ + exp/ivectors_train_comb + + # Extract i-vector for eval sets. + for name in eval_enroll eval_test; do + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 10 \ + exp/extractor data/$name \ + exp/ivectors_$name + done +fi + +if [ $stage -le 6 ]; then + # Compute the mean vector for centering the evaluation i-vectors. + $train_cmd exp/ivectors_train_comb/log/compute_mean.log \ + ivector-mean scp:exp/ivectors_train_comb/ivector.scp \ + exp/ivectors_train_comb/mean.vec || exit 1; + + # This script uses LDA to decrease the dimensionality prior to PLDA. + lda_dim=150 + $train_cmd exp/ivectors_train_comb/log/lda.log \ + ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train_comb/ivector.scp ark:- |" \ + ark:data/train_comb/utt2spk exp/ivectors_train_comb/transform.mat || exit 1; + + # Train the PLDA model. 
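+  # The training i-vectors are centered (global mean subtracted), projected
+  # with the LDA transform estimated above and length-normalized before
+  # ivector-compute-plda models the speaker variability using the spk2utt
+  # grouping.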
+ $train_cmd exp/ivectors_train_comb/log/plda.log \ + ivector-compute-plda ark:data/train_comb/spk2utt \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train_comb/ivector.scp ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + exp/ivectors_train_comb/plda || exit 1; + +fi + +if [ $stage -le 7 ]; then + # Compute PLDA scores for CN-Celeb eval core trials + $train_cmd exp/scores/log/cnceleb_eval_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + --num-utts=ark:exp/ivectors_eval_enroll/num_utts.ark \ + "ivector-copy-plda --smoothing=0.0 exp/ivectors_train_comb/plda - |" \ + "ark:ivector-mean ark:data/eval_enroll/spk2utt scp:exp/ivectors_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_comb/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean exp/ivectors_train_comb/mean.vec scp:exp/ivectors_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_train_comb/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$eval_trails_core' | cut -d\ --fields=1,2 |" exp/scores/cnceleb_eval_scores || exit 1; + + # CN-Celeb Eval Core: + # EER: 13.91% + # minDCF(p-target=0.01): 0.6530 + # minDCF(p-target=0.001): 0.7521 + echo -e "\nCN-Celeb Eval Core:"; + eer=$(paste $eval_trails_core exp/scores/cnceleb_eval_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null) + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores/cnceleb_eval_scores $eval_trails_core 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores/cnceleb_eval_scores $eval_trails_core 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" +fi diff --git a/egs/cnceleb/v1/sid b/egs/cnceleb/v1/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/cnceleb/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/cnceleb/v1/steps b/egs/cnceleb/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/cnceleb/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/cnceleb/v1/utils b/egs/cnceleb/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/cnceleb/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh index 635e3de1076..d4acd0fed4b 100755 --- a/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/commonvoice/s5/local/chain/tuning/run_tdnn_1a.sh @@ -141,7 +141,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/commonvoice/s5/local/prepare_dict.sh b/egs/commonvoice/s5/local/prepare_dict.sh index d6d1aba41fb..cdfffe42080 100755 --- a/egs/commonvoice/s5/local/prepare_dict.sh +++ b/egs/commonvoice/s5/local/prepare_dict.sh @@ -52,7 +52,7 @@ if [[ "$(uname)" == "Darwin" ]]; then alias readlink=greadlink fi -sequitur=$KALDI_ROOT/tools/sequitur +sequitur=$KALDI_ROOT/tools/sequitur-g2p export 
PATH=$PATH:$sequitur/bin export PYTHONPATH=$PYTHONPATH:`utils/make_absolute.sh $sequitur/lib/python*/site-packages` diff --git a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh index a463db77066..75ceb80e3e0 100755 --- a/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/csj/s5/local/chain/tuning/run_tdnn_1a.sh @@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/csj/s5/local/csj_data_prep.sh b/egs/csj/s5/local/csj_data_prep.sh index 55738bf0e37..69e2865e316 100755 --- a/egs/csj/s5/local/csj_data_prep.sh +++ b/egs/csj/s5/local/csj_data_prep.sh @@ -45,7 +45,9 @@ if [ ! -d $CSJ ]; then fi # CSJ dictionary file check -[ ! -f $dir/lexicon.txt ] && cp $CSJ/lexicon/lexicon.txt $dir || exit 1; +if [ ! -f $dir/lexicon.txt ]; then + cp $CSJ/lexicon/lexicon.txt $dir || exit 1; +fi ### Config of using wav data that relates with acoustic model training ### if [ $mode -eq 3 ] diff --git a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh index f288e4fb4d3..5cd78ee94ae 100755 --- a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh +++ b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh @@ -61,7 +61,7 @@ if [ ! -e $outd/.done_make_trans ];then mkdir -p $outd/$vol/$id case "$csjv" in - "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/$WAV" ;; + "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/${WAV}$vol" ;; "dvd" ) TPATH="$resource/$vol/$id" ; WPATH="$resource/$vol/$id" ;; "merl" ) TPATH="$resource/$vol/$SDB" ; WPATH="$resource/$vol/$WAV" ;; esac diff --git a/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh b/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh index 4677ff473cb..297aed1f486 100755 --- a/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh +++ b/egs/csj/s5/local/nnet/run_dnn_tandem_uc.sh @@ -280,4 +280,4 @@ exit 0 %WER 14.88 [ 2557 / 17189, 556 ins, 359 del, 1642 sub ] exp/tandem2uc-tri4/decode_eval3_csj/wer_20_0.5 %WER 17.03 [ 2927 / 17189, 592 ins, 417 del, 1918 sub ] exp/tandem2uc-tri4/decode_eval3_csj.si/wer_20_1.0 %WER 13.44 [ 2311 / 17189, 430 ins, 340 del, 1541 sub ] exp/tandem2uc-tri4_mmi_b0.1/decode_eval3_csj/wer_20_1.0 -EOF \ No newline at end of file +EOF diff --git a/egs/csj/s5/local/run_sgmm2.sh b/egs/csj/s5/local/run_sgmm2.sh index 619c6c5d1ef..c66b43c4f7f 100755 --- a/egs/csj/s5/local/run_sgmm2.sh +++ b/egs/csj/s5/local/run_sgmm2.sh @@ -18,7 +18,7 @@ fi if [ ! -f exp/ubm5/final.ubm ]; then steps/train_ubm.sh --cmd "$train_cmd" 1400 data/train_nodup data/lang \ exp/tri4_ali_nodup exp/ubm5 || exit 1; -fi +fi # steps/train_sgmm2.sh --cmd "$train_cmd" \ steps/train_sgmm2_group.sh --cmd "$train_cmd" \ diff --git a/egs/dihard_2018/README.txt b/egs/dihard_2018/README.txt new file mode 100644 index 00000000000..a7a00c8bf4e --- /dev/null +++ b/egs/dihard_2018/README.txt @@ -0,0 +1,14 @@ + + This is a Kaldi recipe for The First DIHARD Speech Diarization Challenge. 
+ DIHARD is a new annual challenge focusing on "hard" diarization; that is, + speech diarization for challenging corpora where there is an expectation that + the current state-of-the-art will fare poorly, including, but not limited + to: clinical interviews, extended child language acquisition recordings, + YouTube videos and "speech in the wild" (e.g., recordings in restaurants) + See https://coml.lscp.ens.fr/dihard/index.html for details. + + The subdirectories "v1" and so on are different speaker diarization + recipes. The recipe in v1 demonstrates a standard approach using a + full-covariance GMM-UBM, i-vectors, PLDA scoring and agglomerative + hierarchical clustering. The example in v2 demonstrates DNN speaker + embeddings, PLDA scoring and agglomerative hierarchical clustering. diff --git a/egs/dihard_2018/v1/README.txt b/egs/dihard_2018/v1/README.txt new file mode 100644 index 00000000000..98bf3641b03 --- /dev/null +++ b/egs/dihard_2018/v1/README.txt @@ -0,0 +1,13 @@ + This recipe is the speaker diarization recipe for The First DIHARD Speech + Diarization Challenge (DIHARD 2018). There are two tracks in the DIHARD 2018 + competition , one uses oracle SAD (track1) and the other required that SAD + was performed from scratch (track2). This script is for track1. + + The recipe is closely based on the following paper: + http://www.danielpovey.com/files/2018_interspeech_dihard.pdf but doesn't + contain the VB refinement. The whole system mainly contains full-covariance + GMM-UBM, i-vector extractor (T-matrix), PLDA scoring and agglomerative + hierarchical clustering. The VoxCeleb datasets are used for training i-vectors + and PLDA. The development set of the DIHARD 2018 competition is used as + validation set to tune parameters. The system is tested on the DIHARD 2018 + evaluation set. diff --git a/egs/dihard_2018/v1/cmd.sh b/egs/dihard_2018/v1/cmd.sh new file mode 100755 index 00000000000..c35cd18f287 --- /dev/null +++ b/egs/dihard_2018/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" + + diff --git a/egs/dihard_2018/v1/conf/mfcc.conf b/egs/dihard_2018/v1/conf/mfcc.conf new file mode 100644 index 00000000000..649cffb9de8 --- /dev/null +++ b/egs/dihard_2018/v1/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). 
+--num-mel-bins=30 +--num-ceps=24 +--snip-edges=false diff --git a/egs/dihard_2018/v1/conf/vad.conf b/egs/dihard_2018/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/dihard_2018/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/dihard_2018/v1/diarization b/egs/dihard_2018/v1/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/dihard_2018/v1/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.py b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py new file mode 100755 index 00000000000..fa652da8b4c --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_dev.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# This script is called by local/make_dihard_2018_dev.sh, and it creates the +# necessary files for DIHARD 2018 development directory. + +import sys, os + +def prepare_dihard_2018_dev(src_dir, data_dir): + wavscp_fi = open(data_dir + "/wav.scp" , 'w') + utt2spk_fi = open(data_dir + "/utt2spk" , 'w') + segments_fi = open(data_dir + "/segments" , 'w') + rttm_fi = open(data_dir + "/rttm" , 'w') + reco2num_spk_fi = open(data_dir + "/reco2num_spk" , 'w') + + for subdir, dirs, files in os.walk(src_dir): + for file in files: + filename = os.path.join(subdir, file) + if filename.endswith(".lab"): + utt = os.path.basename(filename).split(".")[0] + lines = open(filename, 'r').readlines() + segment_id = 0 + for line in lines: + start, end, speech = line.split() + segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4)) + segments_str = "{} {} {} {}\n".format(segment_id_str, utt, start, end) + utt2spk_str = "{} {}\n".format(segment_id_str, utt) + segments_fi.write(segments_str) + utt2spk_fi.write(utt2spk_str) + segment_id += 1 + wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\ + "-b 16 - channels 1 |\n".format(utt, src_dir, utt) + wavscp_fi.write(wav_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_str = fh.read() + rttm_fi.write(rttm_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_list = fh.readlines() + spk_list = [(x.split())[7] for x in rttm_list] + num_spk = len(set(spk_list)) + reco2num_spk_fi.write("{} {}\n".format(utt, num_spk)) + wavscp_fi.close() + utt2spk_fi.close() + segments_fi.close() + rttm_fi.close() + reco2num_spk_fi.close() + return 0 + +def main(): + src_dir = sys.argv[1] + data_dir = sys.argv[2] + if not os.path.exists(data_dir): + os.makedirs(data_dir) + prepare_dihard_2018_dev(src_dir, data_dir) + return 0 + +if __name__=="__main__": + main() diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh b/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh new file mode 100755 index 00000000000..cc48e2e792a --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_dev.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2018 Zili Huang +# Apache 2.0. +# +# This script, called by ../run.sh, creates the DIHARD 2018 development data directory. + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC2018E31 data/dihard_2018_dev" +fi + +path_to_dihard_2018_dev=$1 +data_dir=$2 + +echo "Preparing ${data_dir}..." 
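+# make_dihard_2018_dev.py walks ${path_to_dihard_2018_dev} for *.lab SAD files
+# (one "start end speech" line per segment) and expects the matching audio in
+# data/flac/ and reference RTTMs in data/rttm/; it writes wav.scp, utt2spk,
+# segments, rttm and reco2num_spk into ${data_dir}.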
+local/make_dihard_2018_dev.py ${path_to_dihard_2018_dev} ${data_dir} + +sort -k 2,2 -s ${data_dir}/rttm > ${data_dir}/rttm_tmp +mv ${data_dir}/rttm_tmp ${data_dir}/rttm +sort -k 1,1 -s ${data_dir}/reco2num_spk > ${data_dir}/reco2num_spk_tmp +mv ${data_dir}/reco2num_spk_tmp ${data_dir}/reco2num_spk +utils/fix_data_dir.sh ${data_dir} diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.py b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py new file mode 100755 index 00000000000..2a8acbee58d --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_eval.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 + +# This script is called by local/make_dihard_2018_eval.sh, and it creates the +# necessary files for DIHARD 2018 evaluation directory. + +import sys, os + +def prepare_dihard_2018_eval(src_dir, data_dir): + wavscp_fi = open(data_dir + "/wav.scp" , 'w') + utt2spk_fi = open(data_dir + "/utt2spk" , 'w') + segments_fi = open(data_dir + "/segments" , 'w') + rttm_fi = open(data_dir + "/rttm" , 'w') + reco2num_spk_fi = open(data_dir + "/reco2num_spk" , 'w') + + for subdir, dirs, files in os.walk(src_dir): + for file in files: + filename = os.path.join(subdir, file) + if filename.endswith(".lab"): + utt = os.path.basename(filename).split(".")[0] + lines = open(filename, 'r').readlines() + segment_id = 0 + for line in lines: + start, end, speech = line.split() + segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4)) + segments_str = "{} {} {} {}\n".format(segment_id_str, utt, start, end) + utt2spk_str = "{} {}\n".format(segment_id_str, utt) + segments_fi.write(segments_str) + utt2spk_fi.write(utt2spk_str) + segment_id += 1 + wav_str = "{} sox -t flac {}/data/flac/{}.flac -t wav -r 16k "\ + "-b 16 - channels 1 |\n".format(utt, src_dir, utt) + wavscp_fi.write(wav_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_str = fh.read() + rttm_fi.write(rttm_str) + with open("{}/data/rttm/{}.rttm".format(src_dir, utt), 'r') as fh: + rttm_list = fh.readlines() + spk_list = [(x.split())[7] for x in rttm_list] + num_spk = len(set(spk_list)) + reco2num_spk_fi.write("{} {}\n".format(utt, num_spk)) + wavscp_fi.close() + utt2spk_fi.close() + segments_fi.close() + rttm_fi.close() + reco2num_spk_fi.close() + return 0 + +def main(): + src_dir = sys.argv[1] + data_dir = sys.argv[2] + if not os.path.exists(data_dir): + os.makedirs(data_dir) + prepare_dihard_2018_eval(src_dir, data_dir) + return 0 + +if __name__=="__main__": + main() diff --git a/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh b/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh new file mode 100755 index 00000000000..0a461c635ec --- /dev/null +++ b/egs/dihard_2018/v1/local/make_dihard_2018_eval.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2018 Zili Huang +# Apache 2.0. +# +# This script, called by ../run.sh, creates the DIHARD 2018 evaluation directory. + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC2018E32v1.1 data/dihard_2018_eval" +fi + +path_to_dihard_2018_eval=$1 +data_dir=$2 + +echo "Preparing ${data_dir}..." 
+local/make_dihard_2018_eval.py ${path_to_dihard_2018_eval} ${data_dir} + +sort -k 2,2 -s ${data_dir}/rttm > ${data_dir}/rttm_tmp +mv ${data_dir}/rttm_tmp ${data_dir}/rttm +sort -k 1,1 -s ${data_dir}/reco2num_spk > ${data_dir}/reco2num_spk_tmp +mv ${data_dir}/reco2num_spk_tmp ${data_dir}/reco2num_spk +utils/fix_data_dir.sh ${data_dir} diff --git a/egs/dihard_2018/v1/local/make_voxceleb1.pl b/egs/dihard_2018/v1/local/make_voxceleb1.pl new file mode 100755 index 00000000000..2268c20ab52 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb1.pl @@ -0,0 +1,130 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (! -e "$data_base/vox1_meta.csv") { + system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); +} + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my %id2spkr = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + $id2spkr{$vox_id} = $spkr_id; +} + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + my $new_spkr_id = $spkr_id; + # If we're using a newer version of VoxCeleb1, we need to "deanonymize" + # the speaker labels. 
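+  # %id2spkr (built from vox1_meta.csv above) maps the newer anonymized
+  # VoxCeleb1 IDs to the original speaker labels, keeping utterance and
+  # speaker IDs consistent with the older directory layout.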
+ if (exists $id2spkr{$spkr_id}) { + $new_spkr_id = $id2spkr{$spkr_id}; + } + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + my $utt_id = "$new_spkr_id-$rec_id-$segment"; + if (exists $test_spkrs{$new_spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $new_spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; +close(META_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl new file mode 100755 index 00000000000..0bc13bea251 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb1_v2.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2019 Soonshin Seo +# +# Usage: make_voxceleb1_v2.pl /export/voxceleb1 dev data/dev +# +# The VoxCeleb1 corpus underwent several updates that changed the directory and speaker ID format. +# The script 'make_voxceleb1.pl' works for the oldest version of the corpus. +# This script should be used if you've downloaded the corpus recently. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 dev data/dev\n"; + exit(1); +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} +print "$data_base/$dataset/wav\n"; +opendir my $dh, "$data_base/$dataset/wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if ($dataset eq "dev"){ + open(SPKR_TRAIN, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TRAIN, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! 
/^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TRAIN) or die; + close(WAV_TRAIN) or die; +} + +if ($dataset eq "test"){ + if (! -e "$data_base/voxceleb1_test_v2.txt") { + system("wget -O $data_base/voxceleb1_test_v2.txt http://www.openslr.org/resources/49/voxceleb1_test_v2.txt"); + } + + open(TRIAL_IN, "<", "$data_base/voxceleb1_test_v2.txt") or die "could not open the verification trials file $data_base/voxceleb1_test_v2.txt"; + open(TRIAL_OUT, ">", "$out_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "could not open the output file $out_dir/utt2spk"; + open(WAV_TEST, ">", "$out_dir/wav.scp") or die "could not open the output file $out_dir/wav.scp"; + + my $test_spkrs = (); + while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + # Create entry for left-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path1); + my $utt_id1 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my ($spkr_id, $rec_id, $name) = split('/', $path2); + my $utt_id2 = "$spkr_id-$rec_id-$name"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/wav/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + foreach (@rec_dirs) { + my $rec_id = $_; + opendir my $dh, "$data_base/$dataset/wav/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $name = $_; + my $wav = "$data_base/$dataset/wav/$spkr_id/$rec_id/$name.wav"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $spkr_id", "\n"; + } + } + } + close(SPKR_TEST) or die; + close(WAV_TEST) or die; + close(TRIAL_OUT) or die; + close(TRIAL_IN) or die; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/dihard_2018/v1/local/make_voxceleb2.pl b/egs/dihard_2018/v1/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/dihard_2018/v1/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. 
$0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/dihard_2018/v1/local/prepare_feats.sh b/egs/dihard_2018/v1/local/prepare_feats.sh new file mode 100755 index 00000000000..9fa70a2d91e --- /dev/null +++ b/egs/dihard_2018/v1/local/prepare_feats.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script adds deltas, applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/extract_ivectors.sh extracts i-vectors from very +# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding +# window CMVN in a meaningful way, it must be performed prior to performing +# the subsegmentation. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 +delta_window=3 +delta_order=2 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_ivector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. 
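+# The actual feature processing happens further down: add-deltas, then
+# apply-cmvn-sliding over a $cmn_window-frame window, then copy-feats, which
+# writes one compressed ark/scp pair per job under $featdir.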
+mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/dihard_2018/v1/ivector-$(date +'%m_%d_%H_%M')/ivector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/ivector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +delta_opts="--delta-window=$delta_window --delta-order=$delta_order" + +$cmd JOB=1:$nj $dir/log/create_ivector_cmvn_feats_${name}.JOB.log \ + add-deltas $delta_opts scp:${sdata_in}/JOB/feats.scp ark:- \| \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + ark:- ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/ivector_cmvn_feats_${name}.JOB.ark,$featdir/ivector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/ivector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating ivector features for $name" diff --git a/egs/dihard_2018/v1/path.sh b/egs/dihard_2018/v1/path.sh new file mode 100755 index 00000000000..851c14e27c3 --- /dev/null +++ b/egs/dihard_2018/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/dihard_2018/v1/run.sh b/egs/dihard_2018/v1/run.sh new file mode 100755 index 00000000000..eb23ac500cd --- /dev/null +++ b/egs/dihard_2018/v1/run.sh @@ -0,0 +1,240 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# 2018 Zili Huang +# Apache 2.0. +# +# See ../README.txt for more info on data required. +# Results (diarization error rate) are inline in comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +dihard_2018_dev=/export/corpora/LDC/LDC2018E31 +dihard_2018_eval=/export/corpora/LDC/LDC2018E32v1.1 +num_components=2048 +ivector_dim=400 +ivec_dir=exp/extractor_c${num_components}_i${ivector_dim} + +stage=0 + +if [ $stage -le 0 ]; then + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + # Now prepare the VoxCeleb1 train and test data. 
If you downloaded the corpus soon + # after it was first released, you may need to use an older version of the script, which + # can be invoked as follows: + # local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should give 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train + + # Prepare the development and evaluation set for DIHARD 2018. + local/make_dihard_2018_dev.sh $dihard_2018_dev data/dihard_2018_dev + local/make_dihard_2018_eval.sh $dihard_2018_eval data/dihard_2018_eval +fi + +if [ $stage -le 1 ]; then + # Make MFCCs for each dataset + for name in train dihard_2018_dev dihard_2018_eval; do + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd --max-jobs-run 20" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + done + + # Compute the energy-based VAD for train + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/train exp/make_vad $vaddir + utils/fix_data_dir.sh data/train + + # This writes features to disk after adding deltas and applying the sliding window CMN. + # Although this is somewhat wasteful in terms of disk space, for diarization + # it ends up being preferable to performing the CMN in memory. If the CMN + # were performed in memory it would need to be performed after the subsegmentation, + # which leads to poorer results. + for name in train dihard_2018_dev dihard_2018_eval; do + local/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + if [ -f data/$name/vad.scp ]; then + cp data/$name/vad.scp data/${name}_cmn/ + fi + if [ -f data/$name/segments ]; then + cp data/$name/segments data/${name}_cmn/ + fi + utils/fix_data_dir.sh data/${name}_cmn + done + + echo "0.01" > data/train_cmn/frame_shift + # Create segments to extract i-vectors from for PLDA training data. + # The segments are created using an energy-based speech activity + # detection (SAD) system, but this is not necessary. You can replace + # this with segments computed from your favorite SAD. + diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \ + data/train_cmn data/train_cmn_segmented +fi + +if [ $stage -le 2 ]; then + # Train the UBM on VoxCeleb 1 and 2. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \ + --nj 40 --num-threads 8 \ + data/train $num_components \ + exp/diag_ubm + + sid/train_full_ubm.sh --cmd "$train_cmd --mem 25G" \ + --nj 40 --remove-low-count-gaussians false \ + data/train \ + exp/diag_ubm exp/full_ubm +fi + +if [ $stage -le 3 ]; then + # In this stage, we train the i-vector extractor on a subset of VoxCeleb 1 + # and 2. + # + # Note that there are well over 1 million utterances in our training set, + # and it takes an extremely long time to train the extractor on all of this. + # Also, most of those utterances are very short. Short utterances are + # harmful for training the i-vector extractor. Therefore, to reduce the + # training time and improve performance, we will only train on the 100k + # longest utterances. + utils/subset_data_dir.sh \ + --utt-list <(sort -n -k 2 data/train/utt2num_frames | tail -n 100000) \ + data/train data/train_100k + + # Train the i-vector extractor. 
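+  # Before launching the (comparatively expensive) extractor training below,
+  # it can be worth a quick sanity check of the subset, e.g.:
+  #   wc -l data/train_100k/utt2spk   # should be (about) 100000 utterances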
+ sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 16G" \ + --ivector-dim $ivector_dim --num-iters 5 \ + exp/full_ubm/final.ubm data/train_100k \ + $ivec_dir +fi + +if [ $stage -le 4 ]; then + # Extract i-vectors for DIHARD 2018 development and evaluation set. + # We set apply-cmn false and apply-deltas false because we already add + # deltas and apply cmn in stage 1. + diarization/extract_ivectors.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false --apply-deltas false \ + --min-segment 0.5 $ivec_dir \ + data/dihard_2018_dev_cmn $ivec_dir/ivectors_dihard_2018_dev + + diarization/extract_ivectors.sh --cmd "$train_cmd --mem 20G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false --apply-deltas false \ + --min-segment 0.5 $ivec_dir \ + data/dihard_2018_eval_cmn $ivec_dir/ivectors_dihard_2018_eval + + # Reduce the amount of training data for the PLDA training. + utils/subset_data_dir.sh data/train_cmn_segmented 128000 data/train_cmn_segmented_128k + # Extract i-vectors for the VoxCeleb, which is our PLDA training + # data. A long period is used here so that we don't compute too + # many i-vectors for each recording. + diarization/extract_ivectors.sh --cmd "$train_cmd --mem 25G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false --apply-deltas false \ + --hard-min true $ivec_dir \ + data/train_cmn_segmented_128k $ivec_dir/ivectors_train_segmented_128k +fi + +if [ $stage -le 5 ]; then + # Train a PLDA model on VoxCeleb, using DIHARD 2018 development set to whiten. + "$train_cmd" $ivec_dir/ivectors_dihard_2018_dev/log/plda.log \ + ivector-compute-plda ark:$ivec_dir/ivectors_train_segmented_128k/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$ivec_dir/ivectors_train_segmented_128k/ivector.scp ark:- \ + | transform-vec $ivec_dir/ivectors_dihard_2018_dev/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $ivec_dir/ivectors_dihard_2018_dev/plda || exit 1; +fi + +# Perform PLDA scoring +if [ $stage -le 6 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + diarization/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $ivec_dir/ivectors_dihard_2018_dev $ivec_dir/ivectors_dihard_2018_dev \ + $ivec_dir/ivectors_dihard_2018_dev/plda_scores + + diarization/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $ivec_dir/ivectors_dihard_2018_dev $ivec_dir/ivectors_dihard_2018_eval \ + $ivec_dir/ivectors_dihard_2018_eval/plda_scores +fi + +# Cluster the PLDA scores using a stopping threshold. +if [ $stage -le 7 ]; then + # First, we find the threshold that minimizes the DER on DIHARD 2018 development set. + mkdir -p $ivec_dir/tuning + echo "Tuning clustering threshold for DIHARD 2018 development set" + best_der=100 + best_threshold=0 + + # The threshold is in terms of the log likelihood ratio provided by the + # PLDA scores. In a perfectly calibrated system, the threshold is 0. + # In the following loop, we evaluate DER performance on DIHARD 2018 development + # set using some reasonable thresholds for a well-calibrated system. 
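+  # Once the sweep below has finished, the full threshold-vs-DER table can be
+  # inspected with something like:
+  #   grep 'DIARIZATION ERROR' $ivec_dir/tuning/dihard_2018_dev_t*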
+ for threshold in -0.5 -0.4 -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3 0.4 0.5; do + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $threshold --rttm-channel 1 $ivec_dir/ivectors_dihard_2018_dev/plda_scores \ + $ivec_dir/ivectors_dihard_2018_dev/plda_scores_t$threshold + + md-eval.pl -r data/dihard_2018_dev/rttm \ + -s $ivec_dir/ivectors_dihard_2018_dev/plda_scores_t$threshold/rttm \ + 2> $ivec_dir/tuning/dihard_2018_dev_t${threshold}.log \ + > $ivec_dir/tuning/dihard_2018_dev_t${threshold} + + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $ivec_dir/tuning/dihard_2018_dev_t${threshold}) + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then + best_der=$der + best_threshold=$threshold + fi + done + echo "$best_threshold" > $ivec_dir/tuning/dihard_2018_dev_best + + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $ivec_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $ivec_dir/ivectors_dihard_2018_dev/plda_scores $ivec_dir/ivectors_dihard_2018_dev/plda_scores + + # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD + # 2018 development set. The DIHARD 2018 development set is used as the validation + # set to tune the parameters. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $ivec_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $ivec_dir/ivectors_dihard_2018_eval/plda_scores $ivec_dir/ivectors_dihard_2018_eval/plda_scores + + mkdir -p $ivec_dir/results + # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of + # the DIHARD challenge. The DER is calculated with no unscored collars and including + # overlapping speech. + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $ivec_dir/ivectors_dihard_2018_eval/plda_scores/rttm 2> $ivec_dir/results/threshold.log \ + > $ivec_dir/results/DER_threshold.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $ivec_dir/results/DER_threshold.txt) + # Using supervised calibration, DER: 28.51% + echo "Using supervised calibration, DER: $der%" +fi + +# Cluster the PLDA scores using the oracle number of speakers +if [ $stage -le 8 ]; then + # In this section, we show how to do the clustering if the number of speakers + # (and therefore, the number of clusters) per recording is known in advance. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --reco2num-spk data/dihard_2018_eval/reco2num_spk --rttm-channel 1 \ + $ivec_dir/ivectors_dihard_2018_eval/plda_scores $ivec_dir/ivectors_dihard_2018_eval/plda_scores_num_spk + + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $ivec_dir/ivectors_dihard_2018_eval/plda_scores_num_spk/rttm 2> $ivec_dir/results/num_spk.log \ + > $ivec_dir/results/DER_num_spk.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' 
\ + $ivec_dir/results/DER_num_spk.txt) + # Using the oracle number of speakers, DER: 24.42% + echo "Using the oracle number of speakers, DER: $der%" +fi diff --git a/egs/dihard_2018/v1/sid b/egs/dihard_2018/v1/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/dihard_2018/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/dihard_2018/v1/steps b/egs/dihard_2018/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/dihard_2018/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/dihard_2018/v1/utils b/egs/dihard_2018/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/dihard_2018/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/dihard_2018/v2/README.txt b/egs/dihard_2018/v2/README.txt new file mode 100644 index 00000000000..5487a911184 --- /dev/null +++ b/egs/dihard_2018/v2/README.txt @@ -0,0 +1,17 @@ + This recipe is the speaker diarization recipe for The First DIHARD Speech + Diarization Challenge (DIHARD 2018). There are two tracks in the DIHARD 2018 + competition , one uses oracle SAD (track1) and the other required that SAD + was performed from scratch (track2). This script is for track1. + + The recipe is closely based on the following paper: + http://www.danielpovey.com/files/2018_interspeech_dihard.pdf but doesn't + contain the VB refinement. The whole system mainly contains training and + extract x-vectors, PLDA scoring and agglomerative hierarchical clustering. + The VoxCeleb datasets are used for training x-vectors and PLDA. The + development set of the DIHARD 2018 competition is used as validation set to + tune parameters. The system is tested on the DIHARD 2018 evaluation set. + + We also use the following datasets for augmentation. + + MUSAN http://www.openslr.org/17 + RIR_NOISES http://www.openslr.org/28 diff --git a/egs/dihard_2018/v2/cmd.sh b/egs/dihard_2018/v2/cmd.sh new file mode 100755 index 00000000000..c35cd18f287 --- /dev/null +++ b/egs/dihard_2018/v2/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" + + diff --git a/egs/dihard_2018/v2/conf/mfcc.conf b/egs/dihard_2018/v2/conf/mfcc.conf new file mode 100755 index 00000000000..9e125706aae --- /dev/null +++ b/egs/dihard_2018/v2/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). 
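+# With --num-mel-bins=30 and --num-ceps=30 below, all cepstra are kept, so
+# there is no dimensionality reduction relative to the filterbank.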
+--num-mel-bins=30 +--num-ceps=30 +--snip-edges=false diff --git a/egs/dihard_2018/v2/conf/vad.conf b/egs/dihard_2018/v2/conf/vad.conf new file mode 100755 index 00000000000..c9f5e8b3072 --- /dev/null +++ b/egs/dihard_2018/v2/conf/vad.conf @@ -0,0 +1,4 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 +--vad-proportion-threshold=0.12 +--vad-frames-context=2 diff --git a/egs/dihard_2018/v2/diarization b/egs/dihard_2018/v2/diarization new file mode 120000 index 00000000000..bad937c1444 --- /dev/null +++ b/egs/dihard_2018/v2/diarization @@ -0,0 +1 @@ +../../callhome_diarization/v1/diarization \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_dev.py b/egs/dihard_2018/v2/local/make_dihard_2018_dev.py new file mode 120000 index 00000000000..3c69bc08240 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_dev.py @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_dev.py \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_dev.sh b/egs/dihard_2018/v2/local/make_dihard_2018_dev.sh new file mode 120000 index 00000000000..6fe340e9df2 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_dev.sh @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_dev.sh \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_eval.py b/egs/dihard_2018/v2/local/make_dihard_2018_eval.py new file mode 120000 index 00000000000..d107a5446ca --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_eval.py @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_eval.py \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_dihard_2018_eval.sh b/egs/dihard_2018/v2/local/make_dihard_2018_eval.sh new file mode 120000 index 00000000000..0c01aee4fa7 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_dihard_2018_eval.sh @@ -0,0 +1 @@ +../../v1/local/make_dihard_2018_eval.sh \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_voxceleb1.pl b/egs/dihard_2018/v2/local/make_voxceleb1.pl new file mode 120000 index 00000000000..c54d69af919 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb1.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb1.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl new file mode 120000 index 00000000000..2e7a22eaadc --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb1_v2.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb1_v2.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/make_voxceleb2.pl b/egs/dihard_2018/v2/local/make_voxceleb2.pl new file mode 120000 index 00000000000..701225dfa57 --- /dev/null +++ b/egs/dihard_2018/v2/local/make_voxceleb2.pl @@ -0,0 +1 @@ +../../v1/local/make_voxceleb2.pl \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/nnet3/xvector/prepare_feats.sh b/egs/dihard_2018/v2/local/nnet3/xvector/prepare_feats.sh new file mode 100755 index 00000000000..4ad2c42d8b9 --- /dev/null +++ b/egs/dihard_2018/v2/local/nnet3/xvector/prepare_feats.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very +# short (e.g., 1-2 seconds) segments. 
Therefore, in order to apply the sliding
+# window CMVN in a meaningful way, it must be performed prior to performing
+# the subsegmentation.
+
+nj=40
+cmd="run.pl"
+stage=0
+norm_vars=false
+center=true
+compress=true
+cmn_window=300
+
+echo "$0 $@" # Print the command line for logging
+
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+if [ $# != 3 ]; then
+  echo "Usage: $0 <in-data-dir> <out-data-dir> <feat-dir>"
+  echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features"
+  echo "Options: "
+  echo "  --nj <nj>                                        # number of parallel jobs"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+ echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/dihard_2018/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/dihard_2018/v2/local/nnet3/xvector/run_xvector.sh b/egs/dihard_2018/v2/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/dihard_2018/v2/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/dihard_2018/v2/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/dihard_2018/v2/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..4ee472b1c71 --- /dev/null +++ b/egs/dihard_2018/v2/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright 2017 David Snyder +# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# +# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b). +# +# Apache 2.0. + +# This script trains a DNN similar to the recipe described in +# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf + +. ./cmd.sh +set -e + +stage=1 +train_stage=0 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. 
The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. +# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/dihard_2018/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 100000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 50 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 100 seconds. If the input recording is greater than 100 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=10000 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=25 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
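+  # (Specifically, the extract.config written further below takes the output
+  #  of the affine part of the "tdnn6" layer defined next:
+  #  output-node name=output input=tdnn6.affine)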
+ relu-batchnorm-layer name=tdnn6 dim=512 input=stats + + # This is where another layer the embedding could be extracted + # from, but usually the previous one works better. + relu-batchnorm-layer name=tdnn7 dim=512 + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/dihard_2018/v2/path.sh b/egs/dihard_2018/v2/path.sh new file mode 100755 index 00000000000..851c14e27c3 --- /dev/null +++ b/egs/dihard_2018/v2/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/dihard_2018/v2/run.sh b/egs/dihard_2018/v2/run.sh new file mode 100755 index 00000000000..6cd6630a838 --- /dev/null +++ b/egs/dihard_2018/v2/run.sh @@ -0,0 +1,316 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# 2018 Zili Huang +# Apache 2.0. +# +# See ../README.txt for more info on data required. +# Results (diarization error rate) are inline in comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +nnet_dir=exp/xvector_nnet_1a +musan_root=/export/corpora/JHU/musan +dihard_2018_dev=/export/corpora/LDC/LDC2018E31 +dihard_2018_eval=/export/corpora/LDC/LDC2018E32v1.1 + +stage=0 + +if [ $stage -le 0 ]; then + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + # Now prepare the VoxCeleb1 train and test data. 
If you downloaded the corpus soon + # after it was first released, you may need to use an older version of the script, which + # can be invoked as follows: + # local/make_voxceleb1.pl $voxceleb1_root data + local/make_voxceleb1_v2.pl $voxceleb1_root dev data/voxceleb1_train + local/make_voxceleb1_v2.pl $voxceleb1_root test data/voxceleb1_test + + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should give 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train + + # Prepare the development and evaluation set for DIHARD 2018. + local/make_dihard_2018_dev.sh $dihard_2018_dev data/dihard_2018_dev + local/make_dihard_2018_eval.sh $dihard_2018_eval data/dihard_2018_eval +fi + +if [ $stage -le 1 ]; then + # Make MFCCs for each dataset. + for name in train dihard_2018_dev dihard_2018_eval; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd --max-jobs-run 20" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + done + + # Compute the energy-based VAD for training set. + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/train exp/make_vad $vaddir + utils/fix_data_dir.sh data/train + + # This writes features to disk after applying the sliding window CMN. + # Although this is somewhat wasteful in terms of disk space, for diarization + # it ends up being preferable to performing the CMN in memory. If the CMN + # were performed in memory (e.g., we used --apply-cmn true in + # diarization/nnet3/xvector/extract_xvectors.sh) it would need to be + # performed after the subsegmentation, which leads to poorer results. + for name in train dihard_2018_dev dihard_2018_eval; do + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + if [ -f data/$name/vad.scp ]; then + cp data/$name/vad.scp data/${name}_cmn/ + fi + if [ -f data/$name/segments ]; then + cp data/$name/segments data/${name}_cmn/ + fi + utils/fix_data_dir.sh data/${name}_cmn + done + + echo "0.01" > data/dihard_2018_dev_cmn/frame_shift + echo "0.01" > data/dihard_2018_eval_cmn/frame_shift + echo "0.01" > data/train_cmn/frame_shift + # Create segments to extract x-vectors from for PLDA training data. + # The segments are created using an energy-based speech activity + # detection (SAD) system, but this is not necessary. You can replace + # this with segments computed from your favorite SAD. + diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \ + data/train_cmn data/train_cmn_segmented +fi + +# In this section, we augment the training data with reverberation, +# noise, music, and babble, and combine it with the clean data. +if [ $stage -le 2 ]; then + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur + + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the training data. Note that we don't add any + # additive noise here. 
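+  # With --speech-rvb-probability 1 and --num-replications 1, every training
+  # utterance receives exactly one reverberated copy, using RIRs drawn from
+  # the small-room/medium-room lists configured above (weight 0.5 each).
+  # The copy_data_dir.sh call below then appends "-reverb" to the utterance
+  # ids, e.g. a hypothetical "id00012-rec1-00001" becomes
+  # "id00012-rec1-00001-reverb".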
+ steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/train data/train_reverb + cp data/train/vad.scp data/train_reverb/ + utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new + rm -rf data/train_reverb + mv data/train_reverb.new data/train_reverb + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + steps/data/make_musan.sh --sampling-rate 16000 $musan_root data + + # Get the duration of the MUSAN recordings. This will be used by the + # script augment_data_dir.py. + for name in speech noise music; do + utils/data/get_utt2dur.sh data/musan_${name} + mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur + done + + # Augment with musan_noise + steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + # Augment with musan_music + steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + # Augment with musan_speech + steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + + # Combine reverb, noise, music, and babble into one directory. + utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble +fi + +if [ $stage -le 3 ]; then + # Take a random subset of the augmentations + utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m + utils/fix_data_dir.sh data/train_aug_1m + + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd --max-jobs-run 20" \ + data/train_aug_1m exp/make_mfcc $mfccdir + + # Combine the clean and augmented training data. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/train_combined data/train_aug_1m data/train +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating training examples, this can be removed. + local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \ + data/train_combined data/train_combined_no_sil exp/train_combined_no_sil + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want at least 4s (400 frames) per utterance. 
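+  # To preview how many utterances the length filter will keep, one can run
+  # something like:
+  #   awk '$2 > 400' data/train_combined_no_sil/utt2num_frames | wc -l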
+ min_len=400 + mv data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_no_sil/utt2num_frames.bak > data/train_combined_no_sil/utt2num_frames + utils/filter_scp.pl data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2spk > data/train_combined_no_sil/utt2spk.new + mv data/train_combined_no_sil/utt2spk.new data/train_combined_no_sil/utt2spk + utils/fix_data_dir.sh data/train_combined_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. + min_num_utts=8 + awk '{print $1, NF-1}' data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/train_combined_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2utt.new + mv data/train_combined_no_sil/spk2utt.new data/train_combined_no_sil/spk2utt + utils/spk2utt_to_utt2spk.pl data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/utt2spk + + utils/filter_scp.pl data/train_combined_no_sil/utt2spk data/train_combined_no_sil/utt2num_frames > data/train_combined_no_sil/utt2num_frames.new + mv data/train_combined_no_sil/utt2num_frames.new data/train_combined_no_sil/utt2num_frames + + # Now we're ready to create training examples. + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +# Stages 6 through 8 are handled in run_xvector.sh, a TDNN embedding extractor is trained. +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage -1 \ + --data data/train_combined_no_sil --nnet-dir $nnet_dir \ + --egs-dir $nnet_dir/egs + +if [ $stage -le 9 ]; then + # Extract x-vectors for DIHARD 2018 development and evaluation set. + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $nnet_dir \ + data/dihard_2018_dev_cmn $nnet_dir/xvectors_dihard_2018_dev + + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $nnet_dir \ + data/dihard_2018_eval_cmn $nnet_dir/xvectors_dihard_2018_eval + + # Reduce the amount of training data for the PLDA training. + utils/subset_data_dir.sh data/train_cmn_segmented 128000 data/train_cmn_segmented_128k + # Extract x-vectors for the VoxCeleb, which is our PLDA training + # data. A long period is used here so that we don't compute too + # many x-vectors for each recording. + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $nnet_dir \ + data/train_cmn_segmented_128k $nnet_dir/xvectors_train_segmented_128k +fi + +# Train PLDA models +if [ $stage -le 10 ]; then + # Train a PLDA model on VoxCeleb, using DIHARD 2018 development set to whiten. 
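+  # In the pipeline below, the x-vectors are mean-subtracted, whitened with the
+  # transform estimated on the DIHARD 2018 dev x-vectors (transform.mat), and
+  # length-normalized before PLDA estimation. If needed, the trained model can
+  # be dumped in text form for inspection, e.g.:
+  #   ivector-copy-plda --binary=false $nnet_dir/xvectors_dihard_2018_dev/plda - | head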
+ "$train_cmd" $nnet_dir/xvectors_dihard_2018_dev/log/plda.log \ + ivector-compute-plda ark:$nnet_dir/xvectors_train_segmented_128k/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$nnet_dir/xvectors_train_segmented_128k/xvector.scp ark:- \ + | transform-vec $nnet_dir/xvectors_dihard_2018_dev/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $nnet_dir/xvectors_dihard_2018_dev/plda || exit 1; +fi + +# Perform PLDA scoring +if [ $stage -le 11 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. + diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $nnet_dir/xvectors_dihard_2018_dev $nnet_dir/xvectors_dihard_2018_dev \ + $nnet_dir/xvectors_dihard_2018_dev/plda_scores + + diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $nnet_dir/xvectors_dihard_2018_dev $nnet_dir/xvectors_dihard_2018_eval \ + $nnet_dir/xvectors_dihard_2018_eval/plda_scores +fi + +# Cluster the PLDA scores using a stopping threshold. +if [ $stage -le 12 ]; then + # First, we find the threshold that minimizes the DER on DIHARD 2018 development set. + mkdir -p $nnet_dir/tuning + echo "Tuning clustering threshold for DIHARD 2018 development set" + best_der=100 + best_threshold=0 + + # The threshold is in terms of the log likelihood ratio provided by the + # PLDA scores. In a perfectly calibrated system, the threshold is 0. + # In the following loop, we evaluate DER performance on DIHARD 2018 development + # set using some reasonable thresholds for a well-calibrated system. + for threshold in -0.5 -0.4 -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3 0.4 0.5; do + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $threshold --rttm-channel 1 $nnet_dir/xvectors_dihard_2018_dev/plda_scores \ + $nnet_dir/xvectors_dihard_2018_dev/plda_scores_t$threshold + + md-eval.pl -r data/dihard_2018_dev/rttm \ + -s $nnet_dir/xvectors_dihard_2018_dev/plda_scores_t$threshold/rttm \ + 2> $nnet_dir/tuning/dihard_2018_dev_t${threshold}.log \ + > $nnet_dir/tuning/dihard_2018_dev_t${threshold} + + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $nnet_dir/tuning/dihard_2018_dev_t${threshold}) + if [ $(perl -e "print ($der < $best_der ? 1 : 0);") -eq 1 ]; then + best_der=$der + best_threshold=$threshold + fi + done + echo "$best_threshold" > $nnet_dir/tuning/dihard_2018_dev_best + + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $nnet_dir/xvectors_dihard_2018_dev/plda_scores $nnet_dir/xvectors_dihard_2018_dev/plda_scores + + # Cluster DIHARD 2018 evaluation set using the best threshold found for the DIHARD + # 2018 development set. The DIHARD 2018 development set is used as the validation + # set to tune the parameters. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $nnet_dir/tuning/dihard_2018_dev_best) --rttm-channel 1 \ + $nnet_dir/xvectors_dihard_2018_eval/plda_scores $nnet_dir/xvectors_dihard_2018_eval/plda_scores + + mkdir -p $nnet_dir/results + # Compute the DER on the DIHARD 2018 evaluation set. We use the official metrics of + # the DIHARD challenge. The DER is calculated with no unscored collars and including + # overlapping speech. + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $nnet_dir/xvectors_dihard_2018_eval/plda_scores/rttm 2> $nnet_dir/results/threshold.log \ + > $nnet_dir/results/DER_threshold.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' 
\ + $nnet_dir/results/DER_threshold.txt) + # Using supervised calibration, DER: 26.30% + echo "Using supervised calibration, DER: $der%" +fi + +# Cluster the PLDA scores using the oracle number of speakers +if [ $stage -le 13 ]; then + # In this section, we show how to do the clustering if the number of speakers + # (and therefore, the number of clusters) per recording is known in advance. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --reco2num-spk data/dihard_2018_eval/reco2num_spk --rttm-channel 1 \ + $nnet_dir/xvectors_dihard_2018_eval/plda_scores $nnet_dir/xvectors_dihard_2018_eval/plda_scores_num_spk + + md-eval.pl -r data/dihard_2018_eval/rttm \ + -s $nnet_dir/xvectors_dihard_2018_eval/plda_scores_num_spk/rttm 2> $nnet_dir/results/num_spk.log \ + > $nnet_dir/results/DER_num_spk.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $nnet_dir/results/DER_num_spk.txt) + # Using the oracle number of speakers, DER: 23.42% + echo "Using the oracle number of speakers, DER: $der%" +fi diff --git a/egs/dihard_2018/v2/sid b/egs/dihard_2018/v2/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/dihard_2018/v2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/dihard_2018/v2/steps b/egs/dihard_2018/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/dihard_2018/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/dihard_2018/v2/utils b/egs/dihard_2018/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/dihard_2018/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/fame/s5/run.sh b/egs/fame/s5/run.sh index 26a8485ff7d..de6fe46b7c4 100755 --- a/egs/fame/s5/run.sh +++ b/egs/fame/s5/run.sh @@ -106,8 +106,8 @@ fi if [ $stage -le 7 ]; then echo "Starting SGMM training." steps/align_fmllr.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/tri3 exp/tri3_ali || exit 1; - steps/train_ubm.sh --cmd "$train_cmd" $numGaussUBM data/train data/lang exp/tri3_ali exp/ubm || exit 1; - steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri3_ali exp/ubm/final.ubm exp/sgmm2 || exit 1; + steps/train_ubm.sh --cmd "$train_cmd" $numGaussUBM data/train data/lang exp/tri3_ali exp/ubm || exit 1; + steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri3_ali exp/ubm/final.ubm exp/sgmm2 || exit 1; echo "SGMM training done." echo "Decoding the development and test sets using SGMM models" diff --git a/egs/fame/v1/local/prepare_for_eer.py b/egs/fame/v1/local/prepare_for_eer.py index 59d2985e7c2..f1dbcfa9ab6 100755 --- a/egs/fame/v1/local/prepare_for_eer.py +++ b/egs/fame/v1/local/prepare_for_eer.py @@ -1,3 +1,4 @@ +from __future__ import print_function # Copyright 2015 David Snyder # Apache 2.0. # @@ -12,4 +13,4 @@ spkrutt2target[spkr+utt]=target for line in scores: spkr, utt, score = line.strip().split() - print score, spkrutt2target[spkr+utt] + print(score, spkrutt2target[spkr+utt]) diff --git a/egs/farsdat/s5/local/nnet/run_dnn.sh b/egs/farsdat/s5/local/nnet/run_dnn.sh index fbb3db72e3e..a02894a7322 100755 --- a/egs/farsdat/s5/local/nnet/run_dnn.sh +++ b/egs/farsdat/s5/local/nnet/run_dnn.sh @@ -53,7 +53,7 @@ if [ $stage -le 1 ]; then # Pre-train DBN, i.e. 
a stack of RBMs (small database, smaller DNN) dir=exp/dnn4_pretrain-dbn (tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log - $cuda_cmd $dir/log/pretrain_dbn.log \ + "$train_cmd" --gpu 1 $dir/log/pretrain_dbn.log \ steps/nnet/pretrain_dbn.sh --hid-dim 1024 --rbm-iter 20 $data_fmllr/train $dir || exit 1; fi @@ -65,7 +65,7 @@ if [ $stage -le 2 ]; then dbn=exp/dnn4_pretrain-dbn/6.dbn (tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log # Train - $cuda_cmd $dir/log/train_nnet.log \ + "$train_cmd" --gpu 1 $dir/log/train_nnet.log \ steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1; # Decode (reuse HCLG graph) @@ -93,7 +93,7 @@ fi if [ $stage -le 4 ]; then # Re-train the DNN by 6 iterations of sMBR - steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + steps/nnet/train_mpe.sh --cmd ""$train_cmd" --gpu 1" --num-iters 6 --acwt $acwt --do-smbr true \ $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 # Decode for ITER in 1 6; do diff --git a/egs/farsdat/s5/run.sh b/egs/farsdat/s5/run.sh index 81f353c301c..4c3d3c5882b 100755 --- a/egs/farsdat/s5/run.sh +++ b/egs/farsdat/s5/run.sh @@ -8,7 +8,7 @@ # farsdat, description of the database: # http://www.assta.org/sst/SST-94-Vol-ll/cache/SST-94-VOL2-Chapter15-p20.pdf -. ./cmd.sh +. ./cmd.sh [ -f path.sh ] && . ./path.sh set -e @@ -54,7 +54,7 @@ echo =========================================================================== # Now make MFCC features. mfccdir=mfcc -for x in train dev test; do +for x in train dev test; do steps/make_mfcc.sh --cmd "$train_cmd" --nj $feats_nj data/$x exp/make_mfcc/$x $mfccdir steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done diff --git a/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..d870ab04c38 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # Switchboard is sampled at 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. 
+--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py b/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py index 9112d868c25..4c96e01ce7e 100755 --- a/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py +++ b/egs/fisher_callhome_spanish/s5/local/callhome_get_lattices.py @@ -5,6 +5,7 @@ # The list of files in the conversations for which 1 best output has to be extracted # words.txt +from __future__ import print_function import os import sys import subprocess @@ -76,7 +77,7 @@ def findLattice(timeDetail): # Concatenate lattices mergedTranslation = latticeConcatenate(mergedTranslation, tmp) - print mergedTranslation + print(mergedTranslation) if mergedTranslation != "": # Sanjeev's Recipe : Remove epsilons and topo sort @@ -95,16 +96,16 @@ def findLattice(timeDetail): # file so it can be checked later proc = subprocess.Popen("/export/a04/gkumar/moses/mosesdecoder/checkplf < " + finalPLFFile + " 2>&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh new file mode 100755 index 00000000000..7f407552c2e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/chain/run_tdnn_1g.sh @@ -0,0 +1,288 @@ +#!/bin/bash + +# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. +# with bypass resnet connections, and re-tuned. +# compute-wer --text --mode=present ark:exp/chain/multipsplice_tdnn/decode_fsp_train_test/scoring_kaldi/test_filt.txt ark,p:- +# %WER 22.21 [ 8847 / 39831, 1965 ins, 2127 del, 4755 sub ] +# %SER 56.98 [ 3577 / 6278 ] +# Scored 6278 sentences, 0 not present in hyp. + +# steps/info/chain_dir_info.pl exp/chain/multipsplice_tdnn +# exp/chain/multipsplice_tdnn: num-iters=296 nj=1..2 num-params=8.2M dim=40+100->2489 combine=-0.170->-0.165 (over 8) xent:train/valid[196,295,final]=(-2.30,-1.93,-1.83/-2.24,-1.96,-1.86) logprob:train/valid[196,295,final]=(-0.208,-0.169,-0.164/-0.189,-0.161,-0.158) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="test dev" +gmm=tri5a # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
+ +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 17 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 18 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 19 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 
bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 20 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand $srand \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.frames-per-iter 5000000 \ + --trainer.optimization.num-jobs-initial 1 \ + --trainer.optimization.num-jobs-final=2 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.optimization.momentum 0.0 \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context 0 \ + --egs.chunk-right-context 0 \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --cleanup.remove-egs $remove_egs \ + --use-gpu true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir exp/tri5a_lats_nodup_sp \ + --dir $dir || exit 1; +fi + +if [ $stage -le 21 ]; then + # The reason we are using data/lang_test here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + #LM was trained only on Fisher Spanish train subset. 
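Editor's note on the dropout_schedule='0,0@0.20,0.3@0.50,0' option set earlier in this script and passed to steps/nnet3/chain/train.py above: such a schedule is a list of (dropout proportion)@(fraction of training) points, interpolated linearly in between, so dropout stays at 0 for the first 20% of training, ramps up to 0.3 at 50%, and decays back to 0 by the end. The sketch below only illustrates that interpretation; it is not the parser used by the training scripts.

    # Sketch of how a dropout schedule string can be read; assumes the
    # piecewise-linear interpretation described above, not Kaldi's parser.
    def parse_schedule(schedule):
        points = []
        pieces = schedule.split(',')
        for i, piece in enumerate(pieces):
            if '@' in piece:
                value, frac = piece.split('@')
            else:
                value, frac = piece, 0.0 if i == 0 else 1.0
            points.append((float(frac), float(value)))
        return sorted(points)

    def dropout_at(progress, points):
        for (f0, v0), (f1, v1) in zip(points, points[1:]):
            if f0 <= progress <= f1:
                t = 0.0 if f1 == f0 else (progress - f0) / (f1 - f0)
                return v0 + t * (v1 - v0)
        return points[-1][1]

    sched = parse_schedule('0,0@0.20,0.3@0.50,0')
    print(dropout_at(0.35, sched))   # 0.15, halfway up the ramp between 20% and 50%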
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph_fsp_train || exit 1; + +fi + +rnnlmdir=exp/rnnlm_lstm_tdnn_1b +if [ $stage -le 22 ]; then + local/rnnlm/train_rnnlm.sh --dir $rnnlmdir || exit 1; +fi + +if [ $stage -le 23 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l &1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 5c09f09bc35..c7aa6affb11 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -1,10 +1,11 @@ #!/usr/bin/env python -# Copyright 2014 Gaurav Kumar. Apache 2.0 # -*- coding: utf-8 -*- # +# Copyright 2014 Gaurav Kumar. Apache 2.0 +# 2018 Nagendra Kumar Goel, Saikiran Valluri, GoVivace inc., Avaaya # Merges unique words from Spanish Fisher, Gigaword and the LDC spanish lexicon - -import sys +from __future__ import print_function +import sys, re import json import codecs import operator @@ -16,6 +17,7 @@ uw_gigaword = tmpdir + "/es_wordlist.json" uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" +filtered_letters = re.compile(u'[¡¥ª°º¿àçèëìîôö0123456789]') merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary @@ -24,8 +26,7 @@ merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the fisher data, the lexicon contains {} entries".format(len(merged_lexicon))) # Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') @@ -34,12 +35,11 @@ if entries[0].lower() not in merged_lexicon: merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." +print("After adding the LDC data, the lexicon contains {} entries".format(len(merged_lexicon))) # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) -gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) +gigaword = reversed(sorted(gigaword.items(), key=operator.itemgetter(1))) for item in gigaword: # We need a maximum of wordlimit words in the lexicon @@ -49,16 +49,16 @@ if item[0].lower() not in merged_lexicon: merged_lexicon.append(item[0].lower()) -print "After adding the Gigaword data, the lexicon contains " \ - + str(len(merged_lexicon)) + " entries." 
+print("After adding the Gigaword data, the lexicon contains {} entries".format(len(merged_lexicon))) # Now write the uniquewords to a file lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + if not item==u'ñ' and not re.search(filtered_letters, item): + lf.write(item + "\n") lf.close() -print "Finshed writing unique words" +print("Finshed writing unique words") diff --git a/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh b/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..cc9de4d26c5 --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,187 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=7 +nj=30 +train_set=train # you might set this to e.g. train. +test_sets="test dev" +gmm=tri5a # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff in (e.g. + # in the tedlium recip it's _cleaned). + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 7 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 7." + exit 1 +fi + + +if [ $stage -le 8 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 9 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/wsj-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 10 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." 
+ + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l $text_dir/ami.txt + cat $dev | cut -d ' ' -f2- > $text_dir/dev.txt +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features 10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,,[noise],[laughter]' \ + $dir/config/words.txt > $dir/config/features.txt + +lstm_opts="l2-regularize=$comp_l2" +tdnn_opts="l2-regularize=$comp_l2" +output_opts="l2-regularize=$output_l2" + + cat >$dir/config/xconfig <&1 | awk 'FNR == 2 {print}'", stdout=subprocess.PIPE, shell=True) line = proc.stdout.readline() - print line + " " + str(lineNo) + print("{} {}".format(line, lineNo)) if line.strip() != "PLF format appears to be correct.": os.system("cp " + finalFST + " " + invalidplfdir + "/" + timeInfo[0]) invalidPLF.write(invalidplfdir + "/" + timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) else: provFile.write(PLFline) else: blankPLF.write(timeInfo[0] + "\n") - rmLines.write(str(lineNo) + "\n") + rmLines.write("{}\n".format(lineNo)) # Now convert to PLF lineNo += 1 diff --git a/egs/fisher_callhome_spanish/s5/path.sh b/egs/fisher_callhome_spanish/s5/path.sh index 1a6fb5f891b..17ffb0369f8 100755 --- a/egs/fisher_callhome_spanish/s5/path.sh +++ b/egs/fisher_callhome_spanish/s5/path.sh @@ -3,3 +3,4 @@ export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/dpovey/libs diff --git a/egs/fisher_callhome_spanish/s5/rnnlm b/egs/fisher_callhome_spanish/s5/rnnlm new file mode 120000 index 00000000000..fb754622d5e --- /dev/null +++ b/egs/fisher_callhome_spanish/s5/rnnlm @@ -0,0 +1 @@ +../../wsj/s5/rnnlm \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 57902a98fed..6e2752a7b68 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -1,20 +1,22 @@ #!/bin/bash # +# Copyright 2018 Nagendra Goel, Saikiran Valluri Apache 2.0 # Copyright 2014 Gaurav Kumar. Apache 2.0 # Recipe for Fisher/Callhome-Spanish -# Made to integrate KALDI with JOSHUA for end-to-end ASR and SMT stage=0 +train_stage=-20 +train_sgmm2=false # call the next line with the directory where the Spanish Fisher data is # (the values below are just an example). 
-sfisher_speech=/veu4/jadrian/data/LDC/LDC2010S01 -sfisher_transcripts=/veu4/jadrian/data/LDC/LDC2010T04 -spanish_lexicon=/veu4/jadrian/data/LDC/LDC96L16 +sfisher_speech=/export/corpora/LDC/LDC2010S01 +sfisher_transcripts=/export/corpora/LDC/LDC2010T04 +spanish_lexicon=/export/corpora/LDC/LDC96L16 split=local/splits/split_fisher -callhome_speech=/veu4/jadrian/data/LDC/LDC96S35 -callhome_transcripts=/veu4/jadrian/data/LDC/LDC96T17 +callhome_speech=/export/corpora/LDC/LDC96S35 +callhome_transcripts=/export/corpora/LDC/LDC96T17 split_callhome=local/splits/split_callhome mfccdir=`pwd`/mfcc @@ -25,7 +27,7 @@ if [ -f path.sh ]; then . ./path.sh; fi set -e -if [ $stage -lt 1 ]; then +if [ $stage -le 1 ]; then local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts local/callhome_data_prep.sh $callhome_speech $callhome_transcripts @@ -95,7 +97,7 @@ if [ $stage -lt 1 ]; then local/callhome_create_splits.sh $split_callhome fi -if [ $stage -lt 2 ]; then +if [ $stage -le 2 ]; then # Now compute CMVN stats for the train, dev and test subsets steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev $mfccdir steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $mfccdir @@ -124,90 +126,95 @@ if [ $stage -lt 2 ]; then utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k fi +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k_nodup data/lang exp/mono0a -steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang exp/mono0a + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/mono0a exp/mono0a_ali || exit 1; - -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/mono0a_ali exp/tri1 || exit 1; -(utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri1/graph data/dev exp/tri1/decode_dev)& + (utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri1/graph data/dev exp/tri1/decode_dev)& -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_30k data/lang exp/tri1 exp/tri1_ali || exit 1; -steps/train_deltas.sh --cmd "$train_cmd" \ + steps/train_deltas.sh --cmd "$train_cmd" \ 2500 20000 data/train_30k data/lang exp/tri1_ali exp/tri2 || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri2/graph data/dev exp/tri2/decode_dev || exit 1; + )& +fi -steps/align_si.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; +if [ $stage -le 4 ]; then + steps/align_si.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri2 exp/tri2_ali || exit 1; # Train tri3a, which is LDA+MLLT, on 100k data. 
-steps/train_lda_mllt.sh --cmd "$train_cmd" \ + steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ 3000 40000 data/train_100k data/lang exp/tri2_ali exp/tri3a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; - steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; -)& - + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; + )& +fi +if [ $stage -le 5 ]; then # Next we'll use fMLLR and train with SAT (i.e. on # fMLLR features) -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_100k data/lang exp/tri3a exp/tri3a_ali || exit 1; -steps/train_sat.sh --cmd "$train_cmd" \ - 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 4000 60000 data/train_100k data/lang exp/tri3a_ali exp/tri4a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri4a/graph data/dev exp/tri4a/decode_dev + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri4a/graph data/dev exp/tri4a/decode_dev )& -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train data/lang exp/tri4a exp/tri4a_ali || exit 1; # Reduce the number of gaussians -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 120000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; -( - utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/dev exp/tri5a/decode_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/test exp/tri5a/decode_test + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/dev exp/tri5a/decode_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test # Decode CALLHOME - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev - steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ - exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train -) & - + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_test exp/tri5a/decode_callhome_test + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ + exp/tri5a/graph data/callhome_dev exp/tri5a/decode_callhome_dev + steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" 
--config conf/decode.config \ + exp/tri5a/graph data/callhome_train exp/tri5a/decode_callhome_train + ) & + + + steps/align_fmllr.sh \ + --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ + data/train data/lang exp/tri5a exp/tri5a_ali +fi -steps/align_fmllr.sh \ - --boost-silence 0.5 --nj 32 --cmd "$train_cmd" \ - data/train data/lang exp/tri5a exp/tri5a_ali +if $train_sgmm2; then steps/train_ubm.sh \ --cmd "$train_cmd" 750 \ @@ -258,22 +265,7 @@ for iter in 1 2 3 4; do done ) & -dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "--num-threads 16") -dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "--gpu 1") - -steps/nnet2/train_pnorm_ensemble.sh \ - --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ - --num-hidden-layers 4 --pnorm-input-dim 2000 --pnorm-output-dim 200\ - --cmd "$train_cmd" \ - "${dnn_gpu_parallel_opts[@]}" \ - --ensemble-size 4 --initial-beta 0.1 --final-beta 5 \ - data/train data/lang exp/tri5a_ali exp/tri6a_dnn +fi -( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 \ - --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev -) & -wait +local/chain/run_tdnn_1g.sh --stage $stage --train-stage $train_stage || exit 1; exit 0; diff --git a/egs/fisher_english/s5/local/chain/run_tdnn.sh b/egs/fisher_english/s5/local/chain/run_tdnn.sh index 14174e617c4..1fd0f1fdf3a 100755 --- a/egs/fisher_english/s5/local/chain/run_tdnn.sh +++ b/egs/fisher_english/s5/local/chain/run_tdnn.sh @@ -112,7 +112,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh index e95de232304..07636a8b3c8 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -209,11 +209,11 @@ diff $sup_tree_dir/tree $sup_chain_dir/tree || { echo "$0: $sup_tree_dir/tree an # steps/nnet3/chain/build_tree_multiple_sources.sh \ # --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ # --frame-subsampling-factor $frame_subsampling_factor \ -# 7000 $lang \ +# 7000 $unsup_decode_lang \ # data/${supervised_set_perturbed} \ # ${sup_tree_dir} \ # data/${unsupervised_set_perturbed} \ -# $chaindir/best_path_${unsupervised_set_perturbed} \ +# ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ # $treedir || exit 1 # fi # @@ -231,7 +231,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh index 
e76df666e8a..b1c133942ef 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_1a.sh @@ -142,7 +142,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh index 2d5b2f8480e..04244014502 100755 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -228,11 +228,11 @@ diff $sup_tree_dir/tree $sup_chain_dir/tree || { echo "$0: $sup_tree_dir/tree an # steps/nnet3/chain/build_tree_multiple_sources.sh \ # --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ # --frame-subsampling-factor $frame_subsampling_factor \ -# 7000 $lang \ +# 7000 $unsup_decode_lang \ # data/${supervised_set_perturbed} \ # ${sup_tree_dir} \ # data/${unsupervised_set_perturbed} \ -# $chaindir/best_path_${unsupervised_set_perturbed} \ +# ${sup_chain_dir}/best_path_${unsupervised_set_perturbed}_big \ # $treedir || exit 1 # fi # @@ -250,7 +250,7 @@ if [ $stage -le 11 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index cbf0ef6cb6c..c12f604f26b 100755 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -133,7 +133,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh index 12b3187a5fa..efcd1eced4a 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7c.sh @@ -129,7 +129,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh index 7d640c3262a..e4a555abfdd 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_7d.sh @@ -134,7 +134,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; 
num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index 07e88b59ddc..5650cedca28 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -142,7 +142,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20" mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh new file mode 100644 index 00000000000..5beb2e74a9a --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a_svd.sh @@ -0,0 +1,448 @@ +#!/bin/bash +# +# Copyright 2018 Nagendra Kumar Goel, +# Saikiran Valluri, Govivace.Inc - Apache 2.0 + +# The script is organized as follows. +# First we train the baseline LSTMP-TDNN chain model for a few epochs on the (Fisher + SWBD) English data. +# Then we perform SVD-based refactoring of all the affine components in this baseline final.mdl, +# in order to reduce the overall model size, +# as determined by the bottleneck dim value or by the energy and shrinkage threshold values. +# Finally, we fine-tune the weight parameters of the refactored model on the entire Fisher + Switchboard data for a single epoch. + +# Command used for comparing the WERs of the pre-SVD and SVD models on different test sets: +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1a_svd_sp +# +# Please run this entire script to the end before running the above WER comparison command.
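Editor's note: the energy and shrinkage thresholds mentioned above (set below as energy_threshold=0.81 and shrinkage_threshold=0.64) control the SVD step roughly as follows: for each affine weight matrix, keep the smallest number of singular values whose squared sum reaches the energy threshold, and accept the resulting two-factor replacement only if it shrinks the parameter count below the shrinkage threshold. The NumPy sketch below illustrates that idea; it is not the nnet3 apply-svd implementation, and the exact definitions used there may differ.

    # Hedged sketch of SVD compression of one weight matrix under an energy
    # threshold and a shrinkage check; not the nnet3 apply-svd code.
    import numpy as np

    def compress(W, energy_threshold=0.81, shrinkage_threshold=0.64):
        U, s, Vt = np.linalg.svd(W, full_matrices=False)
        energy = np.cumsum(s ** 2) / np.sum(s ** 2)
        k = int(np.searchsorted(energy, energy_threshold)) + 1   # smallest rank reaching the threshold
        A, B = U[:, :k] * s[:k], Vt[:k, :]                       # W is approximated by A @ B
        shrinkage = (A.size + B.size) / W.size                   # params after / params before
        if shrinkage > shrinkage_threshold:
            return W, 1.0        # not enough savings, leave this component alone
        return (A, B), shrinkage

    W = np.random.randn(1024, 100) @ np.random.randn(100, 1024)  # a roughly low-rank weight matrix
    factors, shrinkage = compress(W)
    print(shrinkage)   # well under 0.64, so the factorization would be kept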
+ + +# System tdnn_lstm_1a_sp +# WER on eval2000(tg) 12.3 +# [looped:] 12.2 +# WER on eval2000(fg) 12.1 +# [looped:] 12.1 +# WER on eval2000(fg) +# [SVD retrained + looped] 12.1 +# WER on rt03(tg) 11.6 +# [looped:] 11.6 +# WER on rt03(tg) +# [SVD retrained] 12 +# WER on rt03(fg) 11.3 +# [looped:] 11.3 +# Final train prob -0.074 +# Final valid prob -0.084 +# Final train prob (xent) -0.882 +# Final valid prob (xent) -0.9393 + +# WER stats for eval2000 using tdnn_lstm_1a_sp +# | #Snt #Wrd | Corr Sub Del Ins Err S.Err | +# %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 15.9 | 2628 21594 | 86.4 8.9 4.7 2.3 15.9 54.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys + +# WER stats for rt03 using tdnn_lstm_1a_sp +# %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +# %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.4 | 3970 36721 | 91.8 5.3 2.9 1.1 9.4 40.3 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.3 | 8420 76157 | 89.9 6.4 3.7 1.2 11.3 42.4 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.1 | 4450 39436 | 88.3 7.5 4.2 1.4 13.1 44.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +# WER stats for rt03 using tdnn_lstm_1a_svd_sp +# %WER 9.7 | 3970 36721 | 91.3 5.9 2.8 1.0 9.7 40.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 12 | 8420 76157 | 89.3 7.3 3.4 1.3 12.0 42.0 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 14.1 | 4450 39436 | 87.4 8.2 4.3 1.5 14.1 44.6 | exp/chain/tdnn_lstm_1a_svd_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-20 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1a # Note: _sp will get added to this if $speed_perturb == true. +svd_dir=${dir}_svd # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= + +# config for svd +apply_svd=true +energy_threshold=0.81 +shrinkage_threshold=0.64 +primary_lr_factor=0.25 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +src_mdl=${dir}/final.mdl +if $apply_svd && [ $stage -le 14 ]; then + # model compression using SVD + + # threshold configs for tdnn layers + mkdir -p $svd_dir/configs + edits_config=$svd_dir/configs/final.config + common_egs_dir=$dir/egs + cat < ${edits_config} + set-learning-rate-factor learning-rate-factor=$primary_lr_factor + apply-svd name=* energy-threshold=$energy_threshold shrinkage-threshold=$shrinkage_threshold +EOF + + # Copy files / directories from source directory + cp ${dir}/{cmvn_opts,tree,frame_subsampling_factor,0.trans_mdl,normalization.fst,den.fst} $svd_dir/. 
+ + # Generate initial model from trained model + $train_cmd $svd_dir/log/generate_input_mdl.log \ + nnet3-am-copy --edits-config=$edits_config $src_mdl $svd_dir/input.raw + + # Retrain the model for 1 epoch + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --trainer.input-model $svd_dir/input.raw \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 1 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir ${svd_dir} || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg + +if [ $stage -le 16 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +if $apply_svd; then + # Decoding the svd retrained model. + dir=$svd_dir +fi + +if [ $stage -le 18 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 19 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh index c9d50d1f7bd..f3cc869e6de 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -151,7 +151,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) lstm_opts="decay-time=20 dropout-proportion=0.0" mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 1cce08abeee..059a81e15fc 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -148,7 +148,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh index 2334c6a1bc1..d86b699d6f6 100755 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -149,7 +149,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs diff --git a/egs/fisher_swbd/s5/local/fisher_data_prep.sh b/egs/fisher_swbd/s5/local/fisher_data_prep.sh index 909e53aaf30..186f7d7e122 100755 --- a/egs/fisher_swbd/s5/local/fisher_data_prep.sh +++ b/egs/fisher_swbd/s5/local/fisher_data_prep.sh @@ -118,7 +118,7 @@ if [ $stage -le 1 ]; then $line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file"; $call_id eq $1 || die "Mismatch call-id $call_id vs $1\n"; while () { - if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) { + if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.*\S|\S)\s*$/) { $start = sprintf("%06d", $1 * 100.0); $end = sprintf("%06d", $2 * 100.0); length($end) > 6 && die "Time too long $end in file $file"; diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py index 3c447c5976a..75cc4458d85 100755 --- 
a/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py +++ b/egs/fisher_swbd/s5/local/format_acronyms_ctm_eval2000.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py index 59814beb4ea..c3f9af09c99 100755 --- a/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py +++ b/egs/fisher_swbd/s5/local/format_acronyms_ctm_rt03.py @@ -10,6 +10,7 @@ # en_4156 B 414.58 0.16 l # en_4156 B 414.74 0.17 a +from __future__ import division import argparse,re __author__ = 'Minhua Wu' @@ -27,7 +28,7 @@ if items[4].find(".") != -1: letters = items[4].split("._") acronym_period = round(float(items[3]), 2) - letter_slot = round(acronym_period / len(letters), 2) + letter_slot = round(acronym_period/ len(letters), 2) time_start = round(float(items[2]), 2) for l in letters[:-1]: time = " %.2f %.2f " % (time_start, letter_slot) diff --git a/egs/formosa/README.txt b/egs/formosa/README.txt new file mode 100644 index 00000000000..3b9d78dad92 --- /dev/null +++ b/egs/formosa/README.txt @@ -0,0 +1,22 @@ +### Welcome to the demo recipe of the Formosa Speech in the Wild (FSW) Project ### + +The language habits of Taiwanese people are different from those of other Mandarin speakers (both accents and cultures) [1]. In particular, Taiwanese use traditional Chinese characters (i.e., 繁體中文). To address this issue, a Taiwanese speech corpus collection project "Formosa Speech in the Wild (FSW)" was initiated in 2017 to improve the development of Taiwanese-specific speech recognition techniques. + +The FSW corpus will be a large-scale database of real-life, multi-genre Taiwanese spontaneous speech collected and transcribed from various sources (radio, TV, open courses, etc.). To demonstrate that this database is a reasonable data resource for Taiwanese spontaneous speech recognition research, a baseline recipe is provided here for everybody, especially students, to develop their own systems easily and quickly. + +This recipe is based on the "NER-Trs-Vol1" corpus (about 150 hours of broadcast radio speech selected from FSW). For more details, please visit: +* Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw) + +If you want to apply for the NER-Trs-Vol1 corpus, please contact Yuan-Fu Liao (廖元甫) via "yfliao@mail.ntut.edu.tw". This corpus is only for non-commercial research/education use and will be distributed via our GitLab server at https://speech.nchc.org.tw. + +Any bugs, errors, comments or suggestions are very welcome. + +Yuan-Fu Liao (廖元甫) +Associate Professor +Department of Electronic Engineering, +National Taipei University of Technology +http://www.ntut.edu.tw/~yfliao +yfliao@mail.ntut.edu.tw + +............ +[1] The languages of Taiwan consist of several varieties of languages under the families of the Austronesian languages and the Sino-Tibetan languages. Taiwanese Mandarin, Hokkien, Hakka and Formosan languages are used by 83.5%, 81.9%, 6.6% and 1.4% of the population respectively (2010).
Given the prevalent use of Taiwanese Hokkien, the Mandarin spoken in Taiwan has been to a great extent influenced by it. diff --git a/egs/formosa/s5/RESULTS b/egs/formosa/s5/RESULTS new file mode 100644 index 00000000000..b047e5cefe4 --- /dev/null +++ b/egs/formosa/s5/RESULTS @@ -0,0 +1,43 @@ +# +# Reference results +# +# Experimental settings: +# +# training set: show CS, BG, DA, QG, SR, SY and WK, in total 18977 utt., 1,088,948 words +# test set: show JZ, GJ, KX and YX, in total 2112 utt., 135,972 words +# eval set: show JX, TD and WJ, in total 2222 utt., 104,648 words +# +# lexicon: 274,036 words +# phones (IPA): 196 (tonal) +# + +# WER: test + +%WER 61.32 [ 83373 / 135972, 5458 ins, 19156 del, 58759 sub ] exp/mono/decode_test/wer_11_0.0 +%WER 41.00 [ 55742 / 135972, 6725 ins, 12763 del, 36254 sub ] exp/tri1/decode_test/wer_15_0.0 +%WER 40.41 [ 54948 / 135972, 7366 ins, 11505 del, 36077 sub ] exp/tri2/decode_test/wer_14_0.0 +%WER 38.67 [ 52574 / 135972, 6855 ins, 11250 del, 34469 sub ] exp/tri3a/decode_test/wer_15_0.0 +%WER 35.70 [ 48546 / 135972, 7197 ins, 9717 del, 31632 sub ] exp/tri4a/decode_test/wer_17_0.0 +%WER 32.11 [ 43661 / 135972, 6112 ins, 10185 del, 27364 sub ] exp/tri5a/decode_test/wer_17_0.5 +%WER 31.36 [ 42639 / 135972, 6846 ins, 8860 del, 26933 sub ] exp/tri5a_cleaned/decode_test/wer_17_0.5 +%WER 24.43 [ 33218 / 135972, 5524 ins, 7583 del, 20111 sub ] exp/nnet3/tdnn_sp/decode_test/wer_12_0.0 +%WER 23.95 [ 32568 / 135972, 4457 ins, 10271 del, 17840 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.0 +%WER 23.54 [ 32006 / 135972, 4717 ins, 8644 del, 18645 sub ] exp/chain/tdnn_1b_sp/decode_test/wer_10_0.0 +%WER 20.64 [ 28067 / 135972, 4434 ins, 7946 del, 15687 sub ] exp/chain/tdnn_1c_sp/decode_test/wer_11_0.0 +%WER 20.98 [ 28527 / 135972, 4706 ins, 7816 del, 16005 sub ] exp/chain/tdnn_1d_sp/decode_test/wer_10_0.0 + +# CER: test + +%WER 54.09 [ 116688 / 215718, 4747 ins, 24510 del, 87431 sub ] exp/mono/decode_test/cer_10_0.0 +%WER 32.61 [ 70336 / 215718, 5866 ins, 16282 del, 48188 sub ] exp/tri1/decode_test/cer_13_0.0 +%WER 32.10 [ 69238 / 215718, 6186 ins, 15772 del, 47280 sub ] exp/tri2/decode_test/cer_13_0.0 +%WER 30.40 [ 65583 / 215718, 6729 ins, 13115 del, 45739 sub ] exp/tri3a/decode_test/cer_12_0.0 +%WER 27.53 [ 59389 / 215718, 6311 ins, 13008 del, 40070 sub ] exp/tri4a/decode_test/cer_15_0.0 +%WER 24.21 [ 52232 / 215718, 6425 ins, 11543 del, 34264 sub ] exp/tri5a/decode_test/cer_15_0.0 +%WER 23.41 [ 50492 / 215718, 6645 ins, 10997 del, 32850 sub ] exp/tri5a_cleaned/decode_test/cer_17_0.0 +%WER 17.07 [ 36829 / 215718, 4734 ins, 9938 del, 22157 sub ] exp/nnet3/tdnn_sp/decode_test/cer_12_0.0 +%WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +%WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +%WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 +%WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + diff --git a/egs/formosa/s5/cmd.sh b/egs/formosa/s5/cmd.sh new file mode 100755 index 00000000000..66ae9090820 --- /dev/null +++ b/egs/formosa/s5/cmd.sh @@ -0,0 +1,27 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. 
+ +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" + +host=$(hostname -f) +if [ ${host#*.} == "fit.vutbr.cz" ]; then + queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, + export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" + export decode_cmd="queue.pl --config $queue_conf --mem 3G --matylda 0.1" + export cuda_cmd="queue.pl --config $queue_conf --gpu 1 --mem 10G --tmp 40G" +elif [ ${host#*.} == "cm.cluster" ]; then + # MARCC bluecrab cluster: + export train_cmd="slurm.pl --time 4:00:00 " + export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " +fi diff --git a/egs/formosa/s5/conf/decode.config b/egs/formosa/s5/conf/decode.config new file mode 100644 index 00000000000..d91f86183af --- /dev/null +++ b/egs/formosa/s5/conf/decode.config @@ -0,0 +1,5 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + + + diff --git a/egs/formosa/s5/conf/mfcc.conf b/egs/formosa/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/formosa/s5/conf/mfcc_hires.conf b/egs/formosa/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..ca067e77b37 --- /dev/null +++ b/egs/formosa/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # this corpus is sampled at 16kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequency, relative to Nyquist of 8000 (=7800) diff --git a/egs/formosa/s5/conf/online_cmvn.conf b/egs/formosa/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..591367e7ae9 --- /dev/null +++ b/egs/formosa/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used when invoking online2-wav-nnet3-latgen-faster. diff --git a/egs/formosa/s5/conf/pitch.conf b/egs/formosa/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/formosa/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/formosa/s5/local/chain/run_tdnn.sh b/egs/formosa/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..e1adaa9346d --- /dev/null +++ b/egs/formosa/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..66c5ad3335f --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# This script is based on run_tdnn_7h.sh in the swbd chain recipe.
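Editor's note: the mfcc_hires.conf above keeps num-ceps equal to num-mel-bins (40), which is why its comment can claim the features carry the same information as filterbanks while being less correlated: with all cepstra kept, the MFCC step is just an orthonormal DCT of the log-mel energies, an invertible rotation. The sketch below demonstrates the point with scipy's DCT; it is a conceptual illustration, not Kaldi's feature extraction code (which adds details such as dithering and liftering).

    # With num-ceps == num-mel-bins the cepstra are an orthonormal DCT of the
    # log-mel energies, so the filterbank values can be recovered exactly.
    # Conceptual sketch only, using fake data.
    import numpy as np
    from scipy.fftpack import dct, idct

    rng = np.random.default_rng(0)
    log_mel = rng.standard_normal((10, 40))               # 10 frames of 40 log-mel energies
    mfcc = dct(log_mel, type=2, norm='ortho', axis=1)     # 40 cepstra per frame
    recovered = idct(mfcc, type=2, norm='ortho', axis=1)  # exact inverse transform
    print(np.allclose(recovered, log_mel))                # True: no information lost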
+ +set -e + +# configs for 'chain' +affix=1a +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
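+  # For example, with the default xent_regularize=0.1 set in this script,
+  # learning_rate_factor comes out to 0.5 / 0.1 = 5.0.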
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..1981bb0530d --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +# This script shows improvement arising from data cleaning. + +# CER: +# %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_sp +# exp/chain/tdnn_1b_sp: num-iters=133 nj=2..12 num-params=12.5M dim=43+100->4528 combine=-0.073->-0.073 (over 2) xent:train/valid[87,132,final]=(-1.05,-0.964,-0.963/-1.10,-1.06,-1.05) logprob:train/valid[87,132,final]=(-0.079,-0.065,-0.065/-0.094,-0.092,-0.092) + +set -e + +# configs for 'chain' +affix=1b +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=12 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. 
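+#
+# Once both 1a and 1b have been decoded, the two systems can be compared with
+# the same pattern the result-collection stage of run.sh uses, e.g.:
+#   for d in exp/chain/tdnn_1a_sp exp/chain/tdnn_1b_sp; do
+#     grep WER $d/decode_test/cer_* | utils/best_wer.sh
+#   done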
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=625 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..6fa10344cfc --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# CER: +# %WER 16.44 [ 35459 / 215718, 4216 ins, 11278 del, 19965 sub ] exp/chain/tdnn_1b_sp/decode_test/cer_10_0.0 +# %WER 13.72 [ 29605 / 215718, 4678 ins, 8066 del, 16861 sub ] exp/chain/tdnn_1c_sp/decode_test/cer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1c_sp +# exp/chain/tdnn_1c_sp: num-iters=147 nj=3..16 num-params=17.9M dim=43+100->4528 combine=-0.041->-0.041 (over 2) xent:train/valid[97,146,final]=(-0.845,-0.625,-0.618/-0.901,-0.710,-0.703) logprob:train/valid[97,146,final]=(-0.064,-0.040,-0.039/-0.072,-0.058,-0.057) + +set -e + +# configs for 'chain' +affix=1c +nnet3_affix=_1b +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. 
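+# Note on dropout_schedule above: it is read as a piecewise-linear function of
+# the fraction of training completed, i.e. dropout proportion 0 until 20% of
+# training, rising to 0.5 at the 50% point and falling back to 0 by the end.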
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + 
--chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..1f4b7e12850 --- /dev/null +++ b/egs/formosa/s5/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,190 @@ +#!/bin/bash + +# CER: +# 1a: %WER 16.83 [ 36305 / 215718, 4772 ins, 10810 del, 20723 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_9_0.0 +# 1d: %WER 14.08 [ 30364 / 215718, 5182 ins, 7588 del, 17594 sub ] exp/chain/tdnn_1d_sp/decode_test/cer_9_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1d_sp +# exp/chain/tdnn_1d_sp: num-iters=157 nj=3..16 num-params=18.6M dim=43+100->5792 combine=-0.050->-0.050 (over 1) xent:train/valid[103,156,final]=(-0.977,-0.735,-0.725/-0.953,-0.772,-0.768) logprob:train/valid[103,156,final]=(-0.077,-0.052,-0.052/-0.079,-0.065,-0.066) + +set -e + +# configs for 'chain' +affix=1d +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=6 +initial_effective_lrate=0.00025 +final_effective_lrate=0.000025 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=64 +frames_per_eg=150,110,90 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
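+  # (The 1a-1c tuning scripts build this tree with 5000 leaves; 1d uses 7000
+  # in the command below.)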
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3$nnet3_affix/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + 
--trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --use-gpu wait \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in test eval; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix:+_$nnet3_affix}/ivectors_$test_set \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/nnet3/run_ivector_common.sh b/egs/formosa/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..723589ddd2e --- /dev/null +++ b/egs/formosa/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +set -euo pipefail + +# This script is modified based on mini_librispeech/s5/local/nnet3/run_ivector_common.sh + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="test eval" +gmm=tri5a + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_sp_ali + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj 70 data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp \ + exp/make_mfcc/${train_set}_sp mfcc_perturbed || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). 
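+  # (make_mfcc_pitch.sh below appends 3 pitch-related features to the 40
+  # cepstra, so the hires feature dimension ends up as 43, which is what the
+  # "input dim=43" lines in the nnet3/chain configs expect.)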
+ # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_perturbed_hires + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc_pitch.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires exp/make_hires/$datadir $mfccdir || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + # create MFCC data dir without pitch to extract iVector + utils/data/limit_feature_dim.sh 0:39 data/${datadir}_hires data/${datadir}_hires_nopitch || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires_nopitch exp/make_hires/$datadir $mfccdir || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=43 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu wait \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
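+  # (num_jobs below is set to the number of distinct speakers in the test set,
+  # since the decoding split is per speaker and cannot use more jobs than
+  # there are speakers.)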
+ + for decode_set in test eval; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done + wait; +fi + +exit 0; diff --git a/egs/formosa/s5/local/prepare_data.sh b/egs/formosa/s5/local/prepare_data.sh new file mode 100755 index 00000000000..68f342e1549 --- /dev/null +++ b/egs/formosa/s5/local/prepare_data.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# AsusTek Computer Inc. (Author: Alex Hung) + +# Apache 2.0 + +set -e -o pipefail + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +. ./path.sh +. parse_options.sh + +for x in $train_dir $eval_dir; do + if [ ! -d "$x" ] ; then + echo >&2 "The directory $x does not exist" + fi +done + +if [ -z "$(command -v dos2unix 2>/dev/null)" ]; then + echo "dos2unix not found on PATH. Please install it manually." + exit 1; +fi + +# have to remvoe previous files to avoid filtering speakers according to cmvn.scp and feats.scp +rm -rf data/all data/train data/test data/eval data/local/train +mkdir -p data/all data/train data/test data/eval data/local/train + + +# make utt2spk, wav.scp and text +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/all/utt2spk +find $train_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/all/wav.scp +find $train_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/all/text + +# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp, +# duplicate entries and so on). Also, it regenerates the spk2utt from +# utt2spk +utils/fix_data_dir.sh data/all + +echo "Preparing train and test data" +# test set: JZ, GJ, KX, YX +grep -E "(JZ|GJ|KX|YX)_" data/all/utt2spk | awk '{print $1}' > data/all/cv.spk +utils/subset_data_dir_tr_cv.sh --cv-spk-list data/all/cv.spk data/all data/train data/test + +# for LM training +echo "cp data/train/text data/local/train/text for language model training" +cat data/train/text | awk '{$1=""}1;' | awk '{$1=$1}1;' > data/local/train/text + +# preparing EVAL set. +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $y' \; | dos2unix > data/eval/utt2spk +find $eval_dir -name *.wav -exec sh -c 'x={}; y=$(basename -s .wav $x); printf "%s %s\n" $y $x' \; | dos2unix > data/eval/wav.scp +find $eval_key_dir -name *.txt -exec sh -c 'x={}; y=$(basename -s .txt $x); printf "%s " $y; cat $x' \; | dos2unix > data/eval/text +utils/fix_data_dir.sh data/eval + +echo "Data preparation completed." 
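+# A sketch of what the prepared files look like (the utterance id and path are
+# hypothetical):
+#   data/all/wav.scp:  SPK001_0001 NER-Trs-Vol1/Train/.../SPK001_0001.wav
+#   data/all/text:     SPK001_0001 followed by the transcript words
+# Note that utt2spk maps every utterance to itself, i.e. each utterance is
+# treated as its own speaker.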
+exit 0; diff --git a/egs/formosa/s5/local/prepare_dict.sh b/egs/formosa/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..4e580f5f6e8 --- /dev/null +++ b/egs/formosa/s5/local/prepare_dict.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2018 Yuan-Fu Liao, National Taipei University of Technology +# Apache 2.0 + +source_dir=NER-Trs-Vol1/Language +dict_dir=data/local/dict +rm -rf $dict_dir +mkdir -p $dict_dir + +# +# +# +rm -f $dict_dir/lexicon.txt +touch $dict_dir/lexicon.txt +cat $source_dir/lexicon.txt > $dict_dir/lexicon.txt +echo " SIL" >> $dict_dir/lexicon.txt + +# +# define silence phone +# +rm -f $dict_dir/silence_phones.txt +touch $dict_dir/silence_phones.txt + +echo "SIL" > $dict_dir/silence_phones.txt + +# +# find nonsilence phones +# +rm -f $dict_dir/nonsilence_phones.txt +touch $dict_dir/nonsilence_phones.txt + +cat $source_dir/lexicon.txt | grep -v -F -f $dict_dir/silence_phones.txt | \ + perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \ + sort -u > $dict_dir/nonsilence_phones.txt + +# +# add optional silence phones +# + +rm -f $dict_dir/optional_silence.txt +touch $dict_dir/optional_silence.txt +echo "SIL" > $dict_dir/optional_silence.txt + +# +# extra questions +# +rm -f $dict_dir/extra_questions.txt +touch $dict_dir/extra_questions.txt +cat $dict_dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1; +cat $dict_dir/nonsilence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' >> $dict_dir/extra_questions.txt || exit 1; + +echo "Dictionary preparation succeeded" +exit 0; diff --git a/egs/formosa/s5/local/prepare_lm.sh b/egs/formosa/s5/local/prepare_lm.sh new file mode 100755 index 00000000000..59fe1529658 --- /dev/null +++ b/egs/formosa/s5/local/prepare_lm.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 + +set -e -o pipefail + +# To create G.fst from ARPA language model +. 
./path.sh || die "path.sh expected"; + +local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm + +#nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text +local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external + +# let's do ngram interpolation of the previous two LMs +# the lm.gz is always symlink to the model with the best perplexity, so we use that + +mkdir -p data/srilm_interp +for w in 0.9 0.8 0.7 0.6 0.5; do + ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \ + -lambda $w -write-lm data/srilm_interp/lm.${w}.gz + echo -n "data/srilm_interp/lm.${w}.gz " + ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s - +done | sort -k15,15g > data/srilm_interp/perplexities.txt + +# for basic decoding, let's use only a trigram LM +[ -d data/lang_test/ ] && rm -rf data/lang_test +cp -R data/lang data/lang_test +lm=$(cat data/srilm/perplexities.txt | grep 3gram | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_test data/lang_test + +# for decoding using bigger LM let's find which interpolated gave the most improvement +[ -d data/lang_big ] && rm -rf data/lang_big +cp -R data/lang data/lang_big +lm=$(cat data/srilm_interp/perplexities.txt | head -n1 | awk '{print $1}') +local/arpa2G.sh $lm data/lang_big data/lang_big + +# for really big lm, we should only decode using small LM +# and resocre using the big lm +utils/build_const_arpa_lm.sh $lm data/lang_big data/lang_big +exit 0; diff --git a/egs/formosa/s5/local/run_cleanup_segmentation.sh b/egs/formosa/s5/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..b72cd89b4d1 --- /dev/null +++ b/egs/formosa/s5/local/run_cleanup_segmentation.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Nagendra Kumar Goel +# 2019 AsusTek Computer Inc. (author: Alex Hung) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri5a +langdir=data/lang_test +nj=20 +decode_nj=20 +decode_num_threads=1 + +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
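+  # It writes a segmentation work directory ($dir) and a cleaned-up copy of
+  # the training data ($cleaned_data); the exp/tri5a_cleaned system reported
+  # in RESULTS is trained in stage 3 below on that cleaned copy.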
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \ + --nj $nj --cmd "$train_cmd" \ + $data $langdir $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +utils/data/get_utt2dur.sh data/train_cleaned +ori_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${data}/utt2dur) +new_avg_dur=$(awk 'BEGIN{total=0}{total += $2}END{printf("%.2f", total/NR)}' ${cleaned_data}/utt2dur) +echo "average duration was reduced from ${ori_avg_dur}s to ${new_avg_dur}s." +# average duration was reduced from 21.68s to 10.97s. +exit 0; diff --git a/egs/formosa/s5/local/score.sh b/egs/formosa/s5/local/score.sh new file mode 100755 index 00000000000..a9786169973 --- /dev/null +++ b/egs/formosa/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/formosa/s5/local/train_lms.sh b/egs/formosa/s5/local/train_lms.sh new file mode 100755 index 00000000000..efc5b92c573 --- /dev/null +++ b/egs/formosa/s5/local/train_lms.sh @@ -0,0 +1,63 @@ +#!/bin/bash + + +# To be run from one directory above this script. +. ./path.sh + +text=data/local/train/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/train/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + +kaldi_lm=`which train_lm.sh` +if [ -z $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + exit 1 +fi + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
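+# (If a much larger text corpus were ever used, the LM could be pruned with
+# the prune_lm.sh tool from the same kaldi_lm package, roughly:
+#   prune_lm.sh --arpa 3.0 $dir/3gram-mincount/
+# where the threshold 3.0 is only an illustrative value.)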
+# Perplexity over 128254.000000 words is 90.446690 + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +exit 0; diff --git a/egs/formosa/s5/local/wer_hyp_filter b/egs/formosa/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/local/wer_output_filter b/egs/formosa/s5/local/wer_output_filter new file mode 100755 index 00000000000..06a99a43e34 --- /dev/null +++ b/egs/formosa/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/formosa/s5/local/wer_ref_filter b/egs/formosa/s5/local/wer_ref_filter new file mode 100755 index 00000000000..519d92ee80d --- /dev/null +++ b/egs/formosa/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=(''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/formosa/s5/path.sh b/egs/formosa/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/formosa/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/formosa/s5/run.sh b/egs/formosa/s5/run.sh new file mode 100755 index 00000000000..a4d0f2dcd1d --- /dev/null +++ b/egs/formosa/s5/run.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# Copyright 2018, Yuan-Fu Liao, National Taipei University of Technology, yfliao@mail.ntut.edu.tw +# +# Before you run this recipe, please apply, download and put or make a link of the corpus under this folder (folder name: "NER-Trs-Vol1"). +# For more detail, please check: +# 1. Formosa Speech in the Wild (FSW) project (https://sites.google.com/speech.ntut.edu.tw/fsw/home/corpus) +# 2. Formosa Speech Recognition Challenge (FSW) 2018 (https://sites.google.com/speech.ntut.edu.tw/fsw/home/challenge) +stage=-2 +num_jobs=20 + +train_dir=NER-Trs-Vol1/Train +eval_dir=NER-Trs-Vol1-Eval +eval_key_dir=NER-Trs-Vol1-Eval-Key + +# shell options +set -eo pipefail + +. ./cmd.sh +. 
./utils/parse_options.sh + +# configure number of jobs running in parallel, you should adjust these numbers according to your machines +# data preparation +if [ $stage -le -2 ]; then + # Lexicon Preparation, + echo "$0: Lexicon Preparation" + local/prepare_dict.sh || exit 1; + + # Data Preparation + echo "$0: Data Preparation" + local/prepare_data.sh --train-dir $train_dir --eval-dir $eval_dir --eval-key-dir $eval_key_dir || exit 1; + + # Phone Sets, questions, L compilation + echo "$0: Phone Sets, questions, L compilation Preparation" + rm -rf data/lang + utils/prepare_lang.sh --position-dependent-phones false data/local/dict \ + "" data/local/lang data/lang || exit 1; + + # LM training + echo "$0: LM training" + rm -rf data/local/lm/3gram-mincount + local/train_lms.sh || exit 1; + + # G compilation, check LG composition + echo "$0: G compilation, check LG composition" + utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \ + data/local/dict/lexicon.txt data/lang_test || exit 1; + +fi + +# Now make MFCC plus pitch features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc + +# mfcc +if [ $stage -le -1 ]; then + echo "$0: making mfccs" + for x in train test eval; do + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs data/$x exp/make_mfcc/$x $mfccdir || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; + done +fi + +# mono +if [ $stage -le 0 ]; then + echo "$0: train mono model" + # Make some small data subsets for early system-build stages. + echo "$0: make training subsets" + utils/subset_data_dir.sh --shortest data/train 3000 data/train_mono + + # train mono + steps/train_mono.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train_mono data/lang exp/mono || exit 1; + + # Get alignments from monophone system. 
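+  # (Alignment runs on the full data/train, not on the 3000-utterance
+  # data/train_mono subset that was used for monophone training above.)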
+ steps/align_si.sh --boost-silence 1.25 --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + # Monophone decoding + ( + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/mono/graph data/test exp/mono/decode_test + )& +fi + +# tri1 +if [ $stage -le 1 ]; then + echo "$0: train tri1 model" + # train tri1 [first triphone pass] + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + + # align tri1 + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + # decode tri1 + ( + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri1/graph data/test exp/tri1/decode_test + )& +fi + +# tri2 +if [ $stage -le 2 ]; then + echo "$0: train tri2 model" + # train tri2 [delta+delta-deltas] + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + + # align tri2b + steps/align_si.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + + # decode tri2 + ( + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj $num_jobs \ + exp/tri2/graph data/test exp/tri2/decode_test + )& +fi + +# tri3a +if [ $stage -le 3 ]; then + echo "$-: train tri3 model" + # Train tri3a, which is LDA+MLLT, + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; + + # decode tri3a + ( + utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri3a/graph data/test exp/tri3a/decode_test + )& +fi + +# tri4 +if [ $stage -le 4 ]; then + echo "$0: train tri4 model" + # From now, we start building a more serious system (with SAT), and we'll + # do the alignment with fMLLR. + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; + + steps/train_sat.sh --cmd "$train_cmd" \ + 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; + + # align tri4a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri4a exp/tri4a_ali + + # decode tri4a + ( + utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri4a/graph data/test exp/tri4a/decode_test + )& +fi + +# tri5 +if [ $stage -le 5 ]; then + echo "$0: train tri5 model" + # Building a larger SAT system. 
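+  # (3500 leaves / 100k Gaussians here, versus 2500 / 20000 for tri4a above;
+  # tri5a is also the GMM that the cleanup and chain recipes build on.)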
+ steps/train_sat.sh --cmd "$train_cmd" \ + 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; + + # align tri5a + steps/align_fmllr.sh --cmd "$train_cmd" --nj $num_jobs \ + data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; + + # decode tri5 + ( + utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; + steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $num_jobs --config conf/decode.config \ + exp/tri5a/graph data/test exp/tri5a/decode_test || exit 1; + )& +fi + +# nnet3 tdnn models +# commented out by default, since the chain model is usually faster and better +#if [ $stage -le 6 ]; then + # echo "$0: train nnet3 model" + # local/nnet3/run_tdnn.sh +#fi + +# chain model +if [ $stage -le 7 ]; then + # The iVector-extraction and feature-dumping parts coulb be skipped by setting "--train_stage 7" + echo "$0: train chain model" + local/chain/run_tdnn.sh +fi + +# getting results (see RESULTS file) +if [ $stage -le 8 ]; then + echo "$0: extract the results" + for test_set in test eval; do + echo "WER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + echo + + echo "CER: $test_set" + for x in exp/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/*/*/decode_${test_set}*; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null + echo + done +fi + +# finish +echo "$0: all done" + +exit 0; diff --git a/egs/formosa/s5/steps b/egs/formosa/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/formosa/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/formosa/s5/utils b/egs/formosa/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/formosa/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/gale_arabic/s5/local/gale_format_data.sh b/egs/gale_arabic/s5/local/gale_format_data.sh index 85a946a58d9..053323dc194 100755 --- a/egs/gale_arabic/s5/local/gale_format_data.sh +++ b/egs/gale_arabic/s5/local/gale_format_data.sh @@ -57,4 +57,4 @@ fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ echo gale_format_data succeeded. -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/gale_arabic/s5/local/gale_prep_dict.sh b/egs/gale_arabic/s5/local/gale_prep_dict.sh index 74ef789eda7..f6fd83378d0 100755 --- a/egs/gale_arabic/s5/local/gale_prep_dict.sh +++ b/egs/gale_arabic/s5/local/gale_prep_dict.sh @@ -25,9 +25,8 @@ echo SIL > $dir/optional_silence.txt cat $dir/lexicon.txt | cut -d ' ' -f2- | tr -s ' ' '\n' |\ sort -u > $dir/nonsilence_phones.txt || exit 1; +perl -i -pe 'print " SIL\n" if $.==1' $dir/lexicon.txt - sed -i '1i SIL' $dir/lexicon.txt - echo Dictionary preparation succeeded exit 0 diff --git a/egs/gale_arabic/s5/local/gale_train_lms.sh b/egs/gale_arabic/s5/local/gale_train_lms.sh index 1b5d4665a19..8f8e715390f 100755 --- a/egs/gale_arabic/s5/local/gale_train_lms.sh +++ b/egs/gale_arabic/s5/local/gale_train_lms.sh @@ -113,4 +113,4 @@ fi echo train lm succeeded -exit 0 \ No newline at end of file +exit 0 diff --git a/egs/gale_arabic/s5/local/run_sgmm.sh b/egs/gale_arabic/s5/local/run_sgmm.sh index f9ba9b193a8..a5d32d18038 100755 --- a/egs/gale_arabic/s5/local/run_sgmm.sh +++ b/egs/gale_arabic/s5/local/run_sgmm.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash . 
./path.sh @@ -10,17 +10,17 @@ nDecodeJobs=40 galeData=GALE mfccdir=mfcc - -if [[ ! -d exp/tri3b_ali ]]; then + +if [[ ! -d exp/tri3b_ali ]]; then echo "exp/tri3b_ali lattices are required for alignmnet" - exit 1 + exit 1 fi ## SGMM (subspace gaussian mixture model), excluding the "speaker-dependent weights" steps/train_ubm.sh --cmd "$train_cmd" 700 \ data/train data/lang exp/tri3b_ali exp/ubm5a || exit 1; - + steps/train_sgmm2.sh --cmd "$train_cmd" 5000 20000 data/train data/lang exp/tri3b_ali \ exp/ubm5a/final.ubm exp/sgmm_5a || exit 1; @@ -38,16 +38,16 @@ steps/align_sgmm2.sh --nj $nJobs --cmd "$train_cmd" --transform-dir exp/tri3b_al steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split 30 --beam 9.0 --lattice-beam 6 \ --cmd "$decode_cmd" --transform-dir \ exp/tri3b_ali data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; - + steps/train_mmi_sgmm2.sh --cmd "$train_cmd" --num-iters 8 --transform-dir exp/tri3b_ali --boost 0.1 \ data/train data/lang exp/sgmm_5a exp/sgmm_5a_denlats exp/sgmm_5a_mmi_b0.1 - + #decode SGMM MMI utils/mkgraph.sh data/lang_test exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ --config conf/decode.config --transform-dir exp/tri3b/decode \ exp/sgmm_5a_mmi_b0.1/graph data/test exp/sgmm_5a_mmi_b0.1/decode - + for n in 1 2 3 4; do steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n \ --transform-dir exp/tri3b/decode data/lang_test data/test \ diff --git a/egs/gale_arabic/s5b/RESULTS b/egs/gale_arabic/s5b/RESULTS index 2260a106654..a485240ff6b 100644 --- a/egs/gale_arabic/s5b/RESULTS +++ b/egs/gale_arabic/s5b/RESULTS @@ -2,13 +2,7 @@ # This file is generated using local/split_wer.sh $galeData //galeData is a local folder to keep intermediate gale data # look at the end of run.sh in the same folder ## -##### RESULTS generated by amali at 2017-01-01-08-05-59 - Report Results WER: -%WER 9.50 [ 2124 / 22363, 160 ins, 275 del, 1689 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_report_9 -%WER 10.72 [ 2398 / 22363, 163 ins, 313 del, 1922 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_report_9 -%WER 12.04 [ 2693 / 22363, 226 ins, 271 del, 2196 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_report_9 -%WER 12.29 [ 2749 / 22363, 273 ins, 266 del, 2210 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_report_10 %WER 17.82 [ 3986 / 22363, 315 ins, 618 del, 3053 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_report_12 %WER 18.15 [ 4059 / 22363, 335 ins, 589 del, 3135 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_report_11 %WER 18.42 [ 4119 / 22363, 346 ins, 590 del, 3183 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_report_11 @@ -27,10 +21,6 @@ Report Results WER: %WER 25.66 [ 5738 / 22363, 478 ins, 838 del, 4422 sub ] exp/tri2a/decode/wer_report_14 %WER 26.38 [ 5900 / 22363, 435 ins, 929 del, 4536 sub ] exp/tri1/decode/wer_report_15 Conversational Results WER: -%WER 21.59 [ 10213 / 47305, 944 ins, 3092 del, 6177 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_conversational_9 -%WER 24.77 [ 11716 / 47305, 1098 ins, 3579 del, 7039 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_conversational_9 -%WER 26.78 [ 12670 / 47305, 1741 ins, 2434 del, 8495 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_conversational_9 -%WER 27.55 [ 13032 / 47305, 1800 ins, 2666 del, 8566 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_conversational_11 %WER 34.10 [ 16133 / 47305, 1903 ins, 3245 del, 10985 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_conversational_11 %WER 34.81 [ 16466 / 47305, 2077 ins, 3037 del, 11352 sub ] 
exp/sgmm_5a_mmi_b0.1/decode4/wer_conversational_10 %WER 35.19 [ 16648 / 47305, 1933 ins, 3264 del, 11451 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_conversational_11 @@ -49,10 +39,6 @@ Conversational Results WER: %WER 45.92 [ 21724 / 47305, 1995 ins, 5213 del, 14516 sub ] exp/tri2a/decode/wer_conversational_14 %WER 46.86 [ 22166 / 47305, 2212 ins, 4819 del, 15135 sub ] exp/tri1/decode/wer_conversational_13 Combined Results for Reports and Conversational WER: -%WER 17.64 [ 12286 / 69668, 1310 ins, 2807 del, 8169 sub ] exp/chain_cleaned/tdnn_lstm1a_sp_bi/decode/wer_8 -%WER 20.26 [ 14114 / 69668, 1261 ins, 3892 del, 8961 sub ] exp/chain_cleaned/tdnn1b_sp_bi/decode/wer_9 -%WER 22.05 [ 15363 / 69668, 1967 ins, 2705 del, 10691 sub ] exp/nnet3_cleaned/lstm_ld5_sp/decode/wer_9 -%WER 22.66 [ 15786 / 69668, 2047 ins, 2955 del, 10784 sub ] exp/nnet3_cleaned/tdnn_sp/decode/wer_11 %WER 28.89 [ 20127 / 69668, 2244 ins, 3829 del, 14054 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_11 %WER 29.48 [ 20541 / 69668, 2243 ins, 3860 del, 14438 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_11 %WER 29.81 [ 20767 / 69668, 2279 ins, 3854 del, 14634 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_11 @@ -65,8 +51,30 @@ Combined Results for Reports and Conversational WER: %WER 32.36 [ 22542 / 69668, 2156 ins, 4184 del, 16202 sub ] exp/tri2b_mmi/decode_it4/wer_11 %WER 32.50 [ 22640 / 69668, 2393 ins, 3956 del, 16291 sub ] exp/tri2b_mmi/decode_it3/wer_11 %WER 32.79 [ 22847 / 69668, 2407 ins, 4760 del, 15680 sub ] exp/tri2b_mpe/decode_it3/wer_13 +# WER with train_sat_basis +%WER 33.35 [ 23233 / 69668, 2385 ins, 5274 del, 15574 sub ] exp/tri3b/decode/wer_16_0.5 +# WER with train_sat %WER 33.61 [ 23413 / 69668, 2817 ins, 4577 del, 16019 sub ] exp/tri3b/decode/wer_17 %WER 35.73 [ 24894 / 69668, 2630 ins, 4944 del, 17320 sub ] exp/tri3b/decode.si/wer_15 %WER 36.17 [ 25196 / 69668, 2429 ins, 5393 del, 17374 sub ] exp/tri2b/decode/wer_16 %WER 39.42 [ 27462 / 69668, 2473 ins, 6051 del, 18938 sub ] exp/tri2a/decode/wer_14 %WER 40.35 [ 28113 / 69668, 2713 ins, 5635 del, 19765 sub ] exp/tri1/decode/wer_13 + + +# Effect of GMM seed model (tri2b instead of tri3b). Using tri3b give a slightly better result +# as compared to using tri2b as seed. +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3b_sp/decode_test/wer_10_0.0 +%WER 16.71 [ 11642 / 69668, 1145 ins, 2908 del, 7589 sub ] exp/chain/tdnn_1a_2b_sp/decode_test/wer_9_0.0 + +# Effect of Tree-size (3500, 4500, 7000, 11000) +%WER 16.66 [ 11610 / 69668, 1233 ins, 2747 del, 7630 sub ] exp/chain/tdnn_1a_3500_sp/decode_test/wer_10_0.0 +%WER 16.59 [ 11557 / 69668, 1234 ins, 2646 del, 7677 sub ] exp/chain/tdnn_1a_4500_sp/decode_test/wer_10_0.0 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_sp/decode_test/wer_9_0.0 +%WER 16.62 [ 11580 / 69668, 1164 ins, 2789 del, 7627 sub ] exp/chain/tdnn_1a_11000_sp/decode_test/wer_10_0.0 + +# Effect of l2-regularization on the output with tree-size=7000. 
l2 on the output (0.005,0.002) +%WER 16.54 [ 11522 / 69668, 1123 ins, 2739 del, 7660 sub ] exp/chain/tdnn_1a_7000_005_sp/decode_test/wer_9_0.5 +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_7000_002_sp/decode_test/wer_9_0.0 + +#current best 'chain' models (see local/chain/tuning/run_tdnn_1a.sh) +%WER 16.47 [ 11474 / 69668, 1421 ins, 2207 del, 7846 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_9_0.0 diff --git a/egs/gale_arabic/s5b/cmd.sh b/egs/gale_arabic/s5b/cmd.sh index 71dd849a93b..ea341c98d4a 100755 --- a/egs/gale_arabic/s5b/cmd.sh +++ b/egs/gale_arabic/s5b/cmd.sh @@ -10,6 +10,6 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -export train_cmd="queue.pl --mem 2G" -export decode_cmd="queue.pl --mem 4G" -export mkgraph_cmd="queue.pl --mem 8G" +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5b/local/chain/compare_wer.sh b/egs/gale_arabic/s5b/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5b/local/chain/run_chain_common.sh b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5b/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. 
+# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] + if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh index 7afafb31ff6..bf2e45c9914 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_1a.sh @@ -1,31 +1,51 @@ #!/bin/bash -#started from tedlium recipe with few edits +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) -set -e -o pipefail -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). +set -e -o pipefail stage=0 nj=30 -decode_nj=30 -min_seg_len=1.55 -xent_regularize=0.1 train_set=train -gmm=tri2b # the gmm for the target data +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -train_stage=-10 #default -10 -tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix=1b #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir= # you can set this to use previously dumped egs. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true # End configuration section. echo "$0 $@" # Print the command line for logging + . ./cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -39,169 +59,162 @@ where "nvcc" is installed. 
EOF fi -local/nnet3/run_ivector_common.sh --stage $stage \ - --nj $nj \ - --min-seg-len $min_seg_len \ - --train-set $train_set \ - --gmm $gmm \ - --num-threads-ubm $num_threads_ubm \ - --nnet3-affix "$nnet3_affix" - - -gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb -tree_dir=exp/chain${nnet3_affix}/tree_bi${tree_affix} -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats -dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp_bi -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb - +if $run_ivector_common; then + local/nnet3/run_ivector_common.sh \ + --stage $stage --nj $nj \ + --train-set $train_set --gmm $gmm \ + --num-threads-ubm $num_threads_ubm \ + --nnet3-affix "$nnet3_affix" +fi -for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ - $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz $gmm_dir/final.mdl; do +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats +dir=exp/chain${nnet3_affix}/tdnn${affix}_sp +train_data_dir=data/${train_set}_sp_hires +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp + +# note: you don't necessarily have to change the treedir name +# each time you do a new experiment-- only if you change the +# configuration in a way that affects the tree. +tree_dir=exp/chain${nnet3_affix}/tree_a_sp +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +for f in $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ + $lores_train_data_dir/feats.scp $gmm_dir/final.mdl \ + $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done -if [ $stage -le 14 ]; then - echo "$0: creating lang directory with one state per phone." - # Create a version of the lang/ directory that has one state per phone in the - # topo file. [note, it really has two states.. the first one is only repeated - # once, the second one has zero or more repeats.] - if [ -d data/lang_chain ]; then - if [ data/lang_chain/L.fst -nt data/lang/L.fst ]; then - echo "$0: data/lang_chain already exists, not overwriting it; continuing" - else - echo "$0: data/lang_chain already exists and seems to be older than data/lang..." - echo " ... not sure what to do. Exiting." - exit 1; - fi - else - cp -r data/lang data/lang_chain - silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; - nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; - # Use our special topology... note that later on may have to tune this - # topology. - steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >data/lang_chain/topo - fi +# Please take this as a reference on how to specify all the options of +# local/chain/run_chain_common.sh +if $run_chain_common; then + local/chain/run_chain_common.sh --stage $stage \ + --gmm-dir $gmm_dir \ + --ali-dir $ali_dir \ + --lores-train-data-dir ${lores_train_data_dir} \ + --lang $lang \ + --lat-dir $lat_dir \ + --num-leaves 7000 \ + --tree-dir $tree_dir || exit 1; fi if [ $stage -le 15 ]; then - # Get the alignments as lattices (gives the chain training more freedom). 
- # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 16 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir -fi - -if [ $stage -le 17 ]; then mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" mkdir -p $dir/configs + cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. 
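# A quick worked example of the factor described above, assuming the default
# xent_regularize=0.1 that this script sets: the xent output layer gets
# learning-rate-factor 0.5 / 0.1 = 5.0, which the script computes as
#   learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)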
- relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ - fi -if [ $stage -le 18 ]; then + +if [ $stage -le 16 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - steps/nnet3/chain/train.py --stage $train_stage \ + steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ + --chain.l2-regularize 0.0 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ - --trainer.num-chunk-per-minibatch 128 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 4 \ - --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 2 \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir $train_data_dir \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ --tree-dir $tree_dir \ - --lat-dir $lat_dir \ - --dir $dir -fi - + --lat-dir=$lat_dir \ + --dir $dir || exit 1; +fi -if [ $stage -le 19 ]; then - # Note: it might appear that this data/lang_chain directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
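# As a hypothetical illustration of the point above (the directory name
# data/lang_test_big is made up for this sketch): any lang directory whose
# phones.txt matches the model's could be used to build a graph, e.g. one
# carrying a larger LM:
#   utils/lang/check_phones_compatible.sh data/lang_test_big/phones.txt $lang/phones.txt
#   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_big $tree_dir $tree_dir/graph_big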
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; fi -if [ $stage -le 20 ]; then +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) rm $dir/.error 2>/dev/null || true - steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ - --acwt 1.0 --post-decode-acwt 10.0 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ - --scoring-opts "--min-lmwt 5 " \ - $dir/graph data/test_hires $dir/decode || exit 1; + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 fi -exit 0 diff --git a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh index 604f32a1de4..deebafc95e4 100755 --- a/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh +++ b/egs/gale_arabic/s5b/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -120,7 +120,7 @@ if [ $stage -le 17 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh b/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh deleted file mode 100755 index 0125272d06c..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_audio.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -audio_dvds=${@:1:${#}-1} # all the audio dvds for GALE corpus; ; check audio=( in ../run.sh - -mkdir -p $galeData - -# check that sox is installed -which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed"; exit 1 -fi - -for dvd in $audio_dvds; do - dvd_full_path=$(utils/make_absolute.sh $dvd) - if [[ ! 
-e $dvd_full_path ]]; then - echo missing $dvd_full_path; exit 1; - fi - find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do - id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') - echo "$id sox $file -r 16000 -t wav - |" - done -done | sort -u > $galeData/wav.scp - -echo data prep audio succeded - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh b/egs/gale_arabic/s5b/local/gale_data_prep_split.sh deleted file mode 100755 index b18a4e5b105..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_split.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the "; exit 1 -fi - - -#data will data/local - -galeData=$(utils/make_absolute.sh $1) -mkdir -p data/local -dir=$(utils/make_absolute.sh data/local) - - -grep -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.test -grep -v -f local/test_list $galeData/all | grep -v -f local/bad_segments > $galeData/all.train - -for x in test train; do - outdir=$dir/$x - file=$galeData/all.$x - mkdir -p $outdir - awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk - cp -pr $outdir/utt2spk $outdir/spk2utt - awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments - awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done - - -grep -f local/test_list $galeData/wav.scp > $dir/test/wav.scp - -cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} - {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - -echo data prep split succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh b/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh deleted file mode 100755 index 04529d88ac0..00000000000 --- a/egs/gale_arabic/s5b/local/gale_data_prep_txt.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -galeData=$(utils/make_absolute.sh "${@: -1}" ); # last argumnet; the local folder -txt_dvds=${@:1:${#}-1} # all the txt cds correspoding to the audio corpus; check text=( in ../run.sh - - -top_pwd=`pwd` -txtdir=$galeData/txt -mkdir -p $txtdir; cd $txtdir - -for cdx in $txt_dvds; do - echo "Preparing $cdx" - if [[ $cdx == *.tgz ]] ; then - tar -xvf $cdx - elif [ -d "$cdx" ]; then - ln -s $cdx `basename $cdx` - else - echo "I don't really know what I shall do with $cdx " >&2 - fi -done - -find -L . 
-type f -name "*.tdf" | while read file; do -sed '1,3d' $file # delete the first 3 lines -done > all.tmp$$ - -perl -e ' - ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; - open(IN, "$inFile"); - open(ID, ">$idFile"); - open(TXT, ">$txtFile"); - while () { - @arr= split /\t/,$_; - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} - $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; - print TXT "$arr[7]\n"; - }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" - - -perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ - -paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ - -awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $galeData/all -awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/report -awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $galeData/conversational - -cd ..; -rm -fr $txtdir -cd $top_pwd -echo data prep text succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh b/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh deleted file mode 100755 index 5f101f8245b..00000000000 --- a/egs/gale_arabic/s5b/local/gale_prep_grapheme_dict.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -# Copyright 2017 QCRI (author: Ahmed Ali) -# Apache 2.0 - - -# run this from ../ -dir=$(utils/make_absolute.sh data/local/dict) -mkdir -p $dir - - -# (1) Get all avaialble dictionaries, since this is a grapheme model, so we mainly need the most frequent word lists -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2 || exit 1; -wget http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2 || exit 1; -bzcat ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > tmp$$ -bzcat ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> tmp$$ -# (2) Now we add all the words appeared in the training data -cat data/local/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> tmp$$ -grep -v [0-9] tmp$$ | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > tmp1.$$ # remove vowels and rare alef wasla -cat tmp1.$$ | sed 's:\(\):\1 :g' | sed -e 's: : :g' -e 's: : :g' -e 's:\s*: :g' -e 's:\*:V:g' > tmp2.$$ -paste -d ' ' tmp1.$$ tmp2.$$ > $dir/lexicon.txt - -#(2) Dictionary preparation: - -# silence phones, one per line. -echo SIL > $dir/silence_phones.txt -echo SIL > $dir/optional_silence.txt - -# nonsilence phones; on each line is a list of phones that correspond -# really to the same base phone. 
-cat tmp2.$$ | tr -s ' ' '\n' | grep -v ^$ | sort -u > $dir/nonsilence_phones.txt || exit 1; - -sed -i '1i SIL' $dir/lexicon.txt # insert word with phone sil at the begining of the dictionary - -rm -fr ar-ar_lexicon_2014-03-17.txt.bz2 ar-ar_grapheme_lexicon_2016-02-09.bz2 tmp$$ tmp1.$$ tmp2.$$ -echo Dictionary preparation succeeded - -# The script is still missing dates and numbers - -exit 0 - diff --git a/egs/gale_arabic/s5b/local/gale_train_lms.sh b/egs/gale_arabic/s5b/local/gale_train_lms.sh deleted file mode 100755 index 3988ec3818f..00000000000 --- a/egs/gale_arabic/s5b/local/gale_train_lms.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. - - -lexicon=data/local/dict/lexicon.txt -[ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; - - -# This script takes no arguments. It assumes you have already run -# previus steps successfully -# It takes as input the files -#data/local/train.*/text -#data/local/dict/lexicon.txt - - -export LC_ALL=C # You'll get errors about things being not sorted, if you -# have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm -( # First make sure the kaldi_lm toolkit is installed. - cd $KALDI_ROOT/tools || exit 1; - if [ -d kaldi_lm ]; then - echo Not installing the kaldi_lm toolkit since it is already there. - else - echo Downloading and installing the kaldi_lm tools - if [ ! -f kaldi_lm.tar.gz ]; then - wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1; - fi - tar -xvzf kaldi_lm.tar.gz || exit 1; - cd kaldi_lm - make || exit 1; - echo Done making the kaldi_lm tools - fi -) || exit 1; - - -dir=data/local/lm - mkdir -p $dir - text=data/local/train/text - [ ! -f $text ] && echo "$0: No such file $text" && exit 1; - - cleantext=$dir/text.no_oov - - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ - > $cleantext || exit 1; - - - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later). - cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -# note: we probably won't really make use of as there aren't any OOVs - cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ - || exit 1; - -# note: ignore 1st field of train.txt, it's the utterance-id. - cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} - { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ - || exit 1; - - train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; - -# LM is small enough that we don't need to prune it (only about 0.7M N-grams). 
-# Perplexity over 128254.000000 words is 90.446690 - -# note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz - - -echo train lm succeeded - -exit 0 diff --git a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh index f14c8441869..a03cc5b2fa3 100755 --- a/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/gale_arabic/s5b/local/nnet3/run_ivector_common.sh @@ -2,31 +2,29 @@ set -e -o pipefail -# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually -# be called by more scripts). It contains the common feature preparation and iVector-related parts -# of the script. See those scripts for examples of usage. +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. stage=0 nj=100 -min_seg_len=1.55 # min length in seconds... we do this because chain training - # will discard segments shorter than 1.5 seconds. Must remain in sync - # with the same option given to prepare_lores_feats_and_alignments.sh train_set=train # you might set this to e.g. train. -gmm=tri2b # This specifies a GMM-dir from the features of the type you're training the system on; +test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. num_threads_ubm=32 -nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it - # becomes exp/nnet3_cleaned or whatever. +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff . ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do if [ ! -f $f ]; then @@ -61,7 +59,7 @@ if [ $stage -le 2 ]; then utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage fi - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do utils/copy_data_dir.sh data/$datadir data/${datadir}_hires done @@ -69,7 +67,7 @@ if [ $stage -le 2 ]; then # features; this helps make trained nnets more invariant to test data volume. utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires - for datadir in ${train_set}_sp test; do + for datadir in ${train_set}_sp ${test_sets}; do steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${datadir}_hires steps/compute_cmvn_stats.sh data/${datadir}_hires @@ -78,75 +76,33 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: combining short segments of speed-perturbed high-resolution MFCC training data" - # we have to combine short segments or we won't be able to train chain models - # on those segments. - utils/data/combine_short_segments.sh \ - data/${train_set}_sp_hires $min_seg_len data/${train_set}_sp_hires_comb - - # just copy over the CMVN to avoid having to recompute it. - cp data/${train_set}_sp_hires/cmvn.scp data/${train_set}_sp_hires_comb/ - utils/fix_data_dir.sh data/${train_set}_sp_hires_comb/ -fi - -if [ $stage -le 4 ]; then - echo "$0: selecting segments of hires training data that were also present in the" - echo " ... 
original training data." - - # note, these data-dirs are temporary; we put them in a sub-directory - # of the place where we'll make the alignments. - temp_data_root=exp/nnet3${nnet3_affix}/tri5 - mkdir -p $temp_data_root - - utils/data/subset_data_dir.sh --utt-list data/${train_set}/feats.scp \ - data/${train_set}_sp_hires $temp_data_root/${train_set}_hires - - # note: essentially all the original segments should be in the hires data. - n1=$(wc -l /dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . -type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep 
split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5b/local/prepare_dict.sh b/egs/gale_arabic/s5b/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5b/local/prepare_lexicon.py b/egs/gale_arabic/s5b/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5b/local/prepare_lm.sh b/egs/gale_arabic/s5b/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5b/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. 
utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5b/local/score.sh b/egs/gale_arabic/s5b/local/score.sh index 83366f7c7fc..1d84815fc69 100755 --- a/egs/gale_arabic/s5b/local/score.sh +++ b/egs/gale_arabic/s5b/local/score.sh @@ -1,60 +1,6 @@ -#!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. -cmd=run.pl -stage=0 -decode_mbr=true -word_ins_penalty=0.0 -min_lmwt=7 -max_lmwt=17 -iter= #some of the scripts from steps/ seem to use it -#end configuration section. - -echo "$0 $#" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi -data=$1 -lang_or_graph=$2 -dir=$3 - -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; -done - -mkdir -p $dir/scoring/log - -cat $data/text | sed 's:::g' | sed 's:::g' > $dir/scoring/test_filt.txt - -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab \ - ark:- ark,t:$dir/scoring/LMWT.tra || exit 1; +#!/bin/bash -# Note: the double level of quoting for the sed command -$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ - cat $dir/scoring/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text --mode=present \ - ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5b/local/wer_output_filter b/egs/gale_arabic/s5b/local/wer_output_filter new file mode 100755 index 00000000000..cf48b434144 --- /dev/null +++ b/egs/gale_arabic/s5b/local/wer_output_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Hossein Hadian + +# Apache 2.0 +# This script converts a BPE-encoded text to normal text. It is used in scoring + +import sys, io +import string + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + +for line in infile: + words = line.strip().split() + words = [word for word in words if '' not in word] + uttid = words[0] + transcript = ' '.join(words[1:]) + output.write(uttid + ' ' + transcript + '\n') diff --git a/egs/gale_arabic/s5b/run.sh b/egs/gale_arabic/s5b/run.sh index c45f5119949..3f12d22495e 100755 --- a/egs/gale_arabic/s5b/run.sh +++ b/egs/gale_arabic/s5b/run.sh @@ -3,177 +3,121 @@ # Copyright 2014 QCRI (author: Ahmed Ali) # Apache 2.0 -. ./path.sh -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. num_jobs=120 num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false -#NB: You can add whatever number of copora you like. The supported extensions -#NB: (formats) are wav and flac. Flac will be converted using sox and in contrast -#NB: with the old approach, the conversion will be on-the-fly and one-time-only -#NB: during the parametrization. - -#NB: Text corpora scpecification. We support either tgz files, which are unpacked -#NB: or just plain (already unpacked) directories. The list of transcript is then -#NB: obtained using find command - -#Make sure you edit this section to reflect whers you keep the LDC data on your cluster - -#This is CLSP configuration. We add the 2014 GALE data. We got around 2 % -#improvement just by including it. The gain might be large if someone would tweak -# the number of leaves and states and so on. 
- -#audio=( -# /export/corpora/LDC/LDC2013S02/ -# /export/corpora/LDC/LDC2013S07/ -# /export/corpora/LDC/LDC2014S07/ -#) -#text=( -# /export/corpora/LDC/LDC2013T17 -# /export/corpora/LDC/LDC2013T04 -# /export/corpora/LDC/LDC2014T17 -#) - -audio=( - /data/sls/scratch/amali/data/GALE/LDC2013S02 - /data/sls/scratch/amali/data/GALE/LDC2013S07 - /data/sls/scratch/amali/data/GALE/LDC2014S07 -) -text=( - /data/sls/scratch/amali/data/GALE/LDC2013T17.tgz - /data/sls/scratch/amali/data/GALE/LDC2013T04.tgz - /data/sls/scratch/amali/data/GALE/LDC2014T17.tgz -) +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ galeData=GALE -#prepare the data -#split train dev test -#prepare lexicon and LM - -# You can run the script from here automatically, but it is recommended to run the data preparation, -# and features extraction manually and and only once. -# By copying and pasting into your shell. - -#copy the audio files to local folder wav and convet flac files to wav -local/gale_data_prep_audio.sh "${audio[@]}" $galeData || exit 1; +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. -#get the transcription and remove empty prompts and all noise markers -local/gale_data_prep_txt.sh "${text[@]}" $galeData || exit 1; +if [ $stage -le 0 ]; then -# split the data to reports and conversational and for each class will have rain/dev and test -local/gale_data_prep_split.sh $galeData || exit 1; + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi -# get all Arabic grapheme dictionaries and add silence and UNK -local/gale_prep_grapheme_dict.sh || exit 1; + echo "$0: Preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + echo "$0: Preparing lexicon and LM..." + local/prepare_dict.sh -#prepare the langauge resources -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang -# LM training -local/gale_train_lms.sh || exit 1; + local/prepare_lm.sh -local/gale_format_data.sh || exit 1; -# G compilation, check LG composition + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi -# Now make MFCC features. -# mfccdir should be some place with a largish disk where you -# want to store MFCC features. mfccdir=mfcc - -for x in train test ; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ - data/$x exp/make_mfcc/$x $mfccdir - utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons - steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir -done - - -# Here we start the AM - -# Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; - -# Train monophone models on a subset of the data, 10K segment -# Note: the --boost-silence option should probably be omitted by default -steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; - - -# Get alignments from monophone system. 
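# Usage sketch for the staged gale_arabic s5b run.sh introduced in this diff
# (the LDC paths are placeholders; point them at wherever the GALE corpora
# live on your system):
#   ./run.sh --dir1 /path/to/LDC2013S02 --dir2 /path/to/LDC2013S07 --dir3 /path/to/LDC2014S07 \
#            --text1 /path/to/LDC2013T17 --text2 /path/to/LDC2013T04 --text3 /path/to/LDC2014T17
# Since the script sources utils/parse_options.sh, stages can be resumed and
# the GMM decoding passes skipped from the command line:
#   ./run.sh --stage 1
#   ./run.sh --decode-gmm false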
-steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; - -# train tri1 [first triphone pass] -steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; - -# First triphone decoding -utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri1/graph data/test exp/tri1/decode - -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali || exit 1; - -# Train tri2a, which is deltas+delta+deltas -steps/train_deltas.sh --cmd "$train_cmd" \ - 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; - -# tri2a decoding -utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2a/graph data/test exp/tri2a/decode - -# train and decode tri2b [LDA+MLLT] -steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph -steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ - exp/tri2b/graph data/test exp/tri2b/decode - -# Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ - --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; - - -# From 2b system, train 3b which is LDA + MLLT + SAT. -steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; - -utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph -steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ - "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode - -# From 3b system, align all data. -steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ - data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; - - -# nnet3 cross-entropy -local/nnet3/run_tdnn.sh #tdnn recipe: -local/nnet3/run_lstm.sh --stage 12 #lstm recipe (we skip ivector training) - -# chain lattice-free -local/chain/run_tdnn.sh #tdnn recipe: -local/chain/run_tdnn_lstm.sh #tdnn-lstm recipe: - -time=$(date +"%Y-%m-%d-%H-%M-%S") - -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time # to make sure you keep the results timed and owned - -echo training succedded +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1/graph data/test exp/tri1/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali exp/tri2b || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b/graph data/test exp/tri2b/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b/graph data/test exp/tri3b/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." + local/chain/run_tdnn.sh +fi + +echo "$0: training succedded" exit 0 - -#TODO: -#LM (4-gram and RNN) rescoring -#combine lattices -#dialect detection - - - - - diff --git a/egs/gale_arabic/s5c/RESULT b/egs/gale_arabic/s5c/RESULT new file mode 100644 index 00000000000..d56c9e2dbc6 --- /dev/null +++ b/egs/gale_arabic/s5c/RESULT @@ -0,0 +1,4 @@ +%WER 41.98 [ 29249 / 69668, 2672 ins, 5990 del, 20587 sub ] exp/tri1_subword/decode/wer_15_0.0 +%WER 37.66 [ 26239 / 69668, 2660 ins, 5255 del, 18324 sub ] exp/tri2b_subword/decode/wer_17_0.0 +%WER 35.26 [ 24565 / 69668, 2879 ins, 4892 del, 16794 sub ] exp/tri3b_subword/decode/wer_17_0.5 +%WER 17.29 [ 12049 / 69668, 1244 ins, 2758 del, 8047 sub ] exp/chain/tdnn_1a_sp/decode_test/wer_10_0.5 diff --git a/egs/gale_arabic/s5c/cmd.sh b/egs/gale_arabic/s5c/cmd.sh new file mode 100755 index 00000000000..ea341c98d4a --- /dev/null +++ b/egs/gale_arabic/s5c/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="retry.pl queue.pl --mem 2G" +export decode_cmd="retry.pl queue.pl --mem 4G" +export mkgraph_cmd="retry.pl queue.pl --mem 8G" diff --git a/egs/gale_arabic/s5c/conf/decode.config b/egs/gale_arabic/s5c/conf/decode.config new file mode 100644 index 00000000000..6f503eab35e --- /dev/null +++ b/egs/gale_arabic/s5c/conf/decode.config @@ -0,0 +1 @@ +link decode_dnn.config \ No newline at end of file diff --git a/egs/gale_arabic/s5c/conf/mfcc.conf b/egs/gale_arabic/s5c/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/gale_arabic/s5c/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/gale_arabic/s5c/conf/mfcc_hires.conf b/egs/gale_arabic/s5c/conf/mfcc_hires.conf new file mode 100644 index 00000000000..c45f2b691a9 --- /dev/null +++ b/egs/gale_arabic/s5c/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/gale_arabic/s5c/conf/online_cmvn.conf b/egs/gale_arabic/s5c/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/gale_arabic/s5c/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/gale_arabic/s5c/local/bad_segments b/egs/gale_arabic/s5c/local/bad_segments new file mode 100644 index 00000000000..c3413f0714c --- /dev/null +++ b/egs/gale_arabic/s5c/local/bad_segments @@ -0,0 +1,10 @@ +ARABIYA_FROMIRAQ_ARB_20070302_175801_2326286_2327450 +ARABIYA_BILARABI_ARB_20061005_201400_221375_223694 +LBC_NAHAR_ARB_20060911_142800_3683267_3685290 +LBC_NAHAR_ARB_20070303_145800_3249800_3251128 +LBC_NAHAR_ARB_20070303_145800_3623646_3624152 +LBC_NAHAR_ARB_20070305_035800_481003_484069 +ALAM_WITHEVENT_ARB_20070227_205800_3141876_3144152 +ALAM_NEWSRPT_ARB_20070130_015801_2875054_2876396 +ALJZ_TODHARV_ARB_20060914_155800_2947717_2949041 +ALJZ_TODHARV_ARB_20070107_145800_2417848_2419238 diff --git a/egs/gale_arabic/s5c/local/chain/compare_wer.sh b/egs/gale_arabic/s5c/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/gale_arabic/s5c/local/chain/run_chain_common.sh b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh new file mode 100755 index 00000000000..da37e148441 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_chain_common.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# this script has common stages shared across librispeech chain recipes. +# It generates a new topology in a new lang directory, gets the alignments as +# lattices, and builds a tree for the new topology +set -e + +stage=11 + +# input directory names. These options are actually compulsory, and they have +# been named for convenience +gmm_dir= +ali_dir= +lores_train_data_dir= + +num_leaves=6000 + +# output directory names. They are also compulsory. +lang= +lat_dir= +tree_dir= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ -z $lang ] && echo "Set --lang, this specifies the new lang directory which will have the new topology" && exit 1; +[ -z $lat_dir ] && echo "Set --lat-dir, this specifies the experiment directory to store lattice" && exit 1; +[ -z $tree_dir ] && echo "Set --tree-dir, this specifies the directory to store new tree " && exit 1; + +for f in $gmm_dir/final.mdl $ali_dir/ali.1.gz $lores_train_data_dir/feats.scp; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + +if [ $stage -le 11 ]; then + echo "$0: creating lang directory with one state per phone." + # Create a version of the lang/ directory that has one state per phone in the + # topo file. [note, it really has two states.. the first one is only repeated + # once, the second one has zero or more repeats.] 
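+  # The check below reuses an existing $lang directory only when its L.fst is
+  # newer than data/lang/L.fst; an older $lang is treated as stale and the
+  # script exits rather than silently regenerate the topology.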
+ if [ -d $lang ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then + echo "$0: $lang already exists, not overwriting it; continuing" + else + echo "$0: $lang already exists and seems to be older than data/lang..." + echo " ... not sure what to do. Exiting." + exit 1; + fi + else + cp -r data/lang $lang + silphonelist=$(cat $lang/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo + fi +fi + +if [ $stage -le 12 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + nj=$(cat ${ali_dir}/num_jobs) || exit 1; + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + $lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} $lang $ali_dir $tree_dir +fi + +exit 0; diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..bf2e45c9914 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,220 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/tdnn_1a_sp +# System tdnn_1a_sp +# WER 16.47 +# CER 6.68 +# Final train prob -0.0652 +# Final valid prob -0.0831 +# Final train prob (xent) -0.8965 +# Final valid prob (xent) -0.9964 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a_sp/ +# exp/chain/tdnn_1a_sp/: num-iters=441 nj=3..16 num-params=18.6M dim=40+100->5816 combine=-0.063->-0.062 (over 6) xent:train/valid[293,440,final]=(-1.22,-0.912,-0.896/-1.29,-1.01,-0.996) logprob:train/valid[293,440,final]=(-0.097,-0.066,-0.065/-0.108,-0.084,-0.083) + + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +test_set=test +gmm=tri3b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
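+# (With the defaults above, the experiment directory typically ends up as
+# something like exp/chain${nnet3_affix}/tdnn${affix}_sp, i.e. exp/chain/tdnn_1a_sp
+# as quoted in the results at the top of this script; treat the exact pattern as
+# an illustration rather than a guarantee.)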
+common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=150,110,100 +get_egs_stage=-10 + +# training options +srand=0 +remove_egs=true +run_ivector_common=true +run_chain_common=true +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs 6 \ + --trainer.frames-per-iter 1500000 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.stage $get_egs_stage \ + --reporting.email="$reporting_email" \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir=$lat_dir \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 0 --extra-right-context 0 \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test_set}_hires \ + $tree_dir/graph data/${test_set}_hires ${dir}/decode_${test_set} || exit 1 +fi diff --git a/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..deebafc95e4 --- /dev/null +++ b/egs/gale_arabic/s5c/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +#started from tedlium recipe with few edits + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +decode_nj=30 +min_seg_len=1.55 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +xent_regularize=0.1 +train_set=train +gmm=tri2b # the gmm for the target data gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk=150 + +# The rest are configs specific to this script. 
Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 3 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/test_hires $dir/decode || exit 1; +fi +exit 0 diff --git a/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a03cc5b2fa3 --- /dev/null +++ b/egs/gale_arabic/s5c/local/nnet3/run_ivector_common.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +set -e -o pipefail + +# This script is called from scripts like local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more scripts). It +# contains the common feature preparation and iVector-related parts of the +# script. See those scripts for examples of usage. + + +stage=0 +nj=100 +train_set=train # you might set this to e.g. train. 
+test_sets="test" +gmm=tri3b # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix= # affix for exp/nnet3 directory to put iVector stuff + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/gale_arabic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l /dev/null || true + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_test_hires \ + ${graph_dir} data/test_hires ${dir}/decode || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/test_hires ${dir}/decode_test ${dir}/decode_test_rescore || exit 1 +fi + +exit 0; diff --git a/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..6619df668ef --- /dev/null +++ b/egs/gale_arabic/s5c/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# started from tedlium recipe with few edits + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
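+# (Roughly speaking they are forwarded as in the sketch below; this is only an
+# illustration of the pattern, not the literal call made by this script:
+#   local/nnet3/run_ivector_common.sh --stage $stage --nj $nj \
+#     --train-set $train_set --gmm $gmm \
+#     --num-threads-ubm $num_threads_ubm --nnet3-affix "$nnet3_affix"
+# )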
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train +gmm=tri2b # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" +remove_egs=true +relu_dim=850 +num_epochs=3 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < \n"; + exit (1); + } + +# <\check usage> +my $inFile = shift (@ARGV); +my $ouFile = shift(@ARGV); + + +open INFILE, "<$inFile" || die "unable to open the input file $inFile\n"; +binmode INFILE, ":encoding(utf8)"; + + +open OUTPUTFILE, ">$ouFile" or die "unable to open the output mlf file $ouFile\n"; +binmode OUTPUTFILE, ":encoding(utf8)"; + + +while () { + s/[^اأإآبتثجحخدذرزسشصضطظعغفقكلمنهويىئءؤة0-9]+/ /g; ## Removes non Arabic or numbers + my $BW = convertUTF8ToBuckwalter ($_); + print OUTPUTFILE "$BW"."\n"; +} +close INFILE; +close OUTPUTFILE; + + + +# this function is copied from MADATools.pm: MADA Tools + sub convertUTF8ToBuckwalter { + + my ($line)= (@_); + #$line = $UTF8_ENCODING_OBJ->decode($line); ## Same as Encode::decode("utf8",$line), but faster since object already created + $line =~ s/\x{0621}/\'/g; ## HAMZA + $line =~ s/\x{0622}/\|/g; ## ALEF WITH MADDA ABOVE + $line =~ s/\x{0623}/\>/g; ## ALEF WITH HAMZA ABOVE + $line =~ s/\x{0624}/\&/g; ## WAW WITH HAMZA ABOVE + $line =~ s/\x{0625}/\/dev/null +if [[ $? != 0 ]]; then + echo "$0: sox is not installed"; exit 1 +fi + +for dvd in $dir1 $dir2 $dir3; do + dvd_full_path=$(utils/make_absolute.sh $dvd) + if [[ ! -e $dvd_full_path ]]; then + echo "$0: missing $dvd_full_path"; exit 1; + fi + find $dvd_full_path \( -name "*.wav" -o -name "*.flac" \) | while read file; do + id=$(basename $file | awk '{gsub(".wav","");gsub(".flac","");print}') + echo "$id sox $file -r 16000 -t wav - |" + done +done | sort -u > $gale_data/wav.scp +echo "$0:data prep audio succeded" + +gale_data=$(utils/make_absolute.sh "GALE" ); +top_pwd=`pwd` +txtdir=$gale_data/txt +mkdir -p $txtdir; cd $txtdir + +for cdx in $text1 $text2 $text3; do + echo "$0:Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + ln -s $cdx `basename $cdx` + else + echo "$0:I don't really know what I shall do with $cdx " >&2 + fi +done + +find -L . 
-type f -name "*.tdf" | while read file; do +sed '1,3d' $file # delete the first 3 lines +done > all.tmp$$ + +perl -e ' + ($inFile,$idFile,$txtFile)= split /\s+/, $ARGV[0]; + open(IN, "$inFile"); + open(ID, ">$idFile"); + open(TXT, ">$txtFile"); + while () { + @arr= split /\t/,$_; + $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; + if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ){$arr[11]="UNK";} + $id="$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; + print TXT "$arr[7]\n"; + }' "all.tmp$$ allid.tmp$$ contentall.tmp$$" + +perl ${top_pwd}/local/normalize_transcript_BW.pl contentall.tmp$$ contentall.buck.tmp$$ +paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}' > all_1.tmp$$ + + +awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/all +awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/report +awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > $gale_data/conversational + +cd ..; +rm -fr $txtdir +cd $top_pwd +echo "$0:dat a prep text succeeded" + +mkdir -p data +dir=$(utils/make_absolute.sh data/) +grep -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.test +grep -v -f local/test_list $gale_data/all | grep -v -f local/bad_segments > $gale_data/all.train + +for x in test train; do + outdir=data/$x + file=$gale_data/all.$x + mkdir -p $outdir + awk '{print $2 " " $2}' $file | sort -u > $outdir/utt2spk + cp -pr $outdir/utt2spk $outdir/spk2utt + awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments + awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text +done + +grep -f local/test_list $gale_data/wav.scp > $dir/test/wav.scp + +cat $gale_data/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} + {if (seen[$1]) { print $0}}' > $dir/train/wav.scp + +echo "$0:data prep split succeeded" +exit 0 diff --git a/egs/gale_arabic/s5c/local/prepare_dict.sh b/egs/gale_arabic/s5c/local/prepare_dict.sh new file mode 100755 index 00000000000..47b5869fdf1 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_dict.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# Apache 2.0 +# This script prepares the dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." 
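+  # (For orientation: the downloads appear to be Buckwalter-encoded word lists;
+  # only the first column is kept, and together with the unique words of the
+  # training transcripts appended below they form
+  # data/local/lexicon_data/grapheme_lexicon, one word per line, from which
+  # grapheme pronunciations are generated by local/prepare_lexicon.py in the
+  # following block.)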
+ wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v [0-9] data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +sed -i '1i UNK' $dir/lexicon.txt + +echo UNK >> $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh new file mode 100755 index 00000000000..14416e8587e --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash + +# Copyright 2017 QCRI (author: Ahmed Ali) +# 2019 Dongji Gao +# Apache 2.0 +# This script prepares the subword dictionary. + +set -e +dir=data/local/dict +lexicon_url1="http://alt.qcri.org//resources/speech/dictionary/ar-ar_grapheme_lexicon_2016-02-09.bz2"; +lexicon_url2="http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"; +num_merges=1000 +stage=0 +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; +mkdir -p $dir data/local/lexicon_data + +if [ $stage -le 0 ]; then + echo "$0: Downloading text for lexicon... $(date)." + wget -P data/local/lexicon_data $lexicon_url1 + wget -P data/local/lexicon_data $lexicon_url2 + bzcat data/local/lexicon_data/ar-ar_grapheme_lexicon_2016-02-09.bz2 | sed '1,3d' | awk '{print $1}' > data/local/lexicon_data/grapheme_lexicon + bzcat data/local/lexicon_data/ar-ar_lexicon_2014-03-17.txt.bz2 | sed '1,3d' | awk '{print $1}' >> data/local/lexicon_data/grapheme_lexicon + cat data/train/text | cut -d ' ' -f 2- | tr -s " " "\n" | sort -u >> data/local/lexicon_data/grapheme_lexicon +fi + + +if [ $stage -le 0 ]; then + echo "$0: processing lexicon text and creating lexicon... $(date)." + # remove vowels and rare alef wasla + grep -v "[0-9]" data/local/lexicon_data/grapheme_lexicon | sed -e 's:[FNKaui\~o\`]::g' -e 's:{:}:g' | sort -u > data/local/lexicon_data/processed_lexicon + local/prepare_lexicon.py +fi + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo UNK >> $dir/nonsilence_phones.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt + +# Make a subword lexicon based on current word lexicon +glossaries=" " +if [ $stage -le 0 ]; then + echo "$0: making subword lexicon... $(date)." 
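+  # (Sketch of what the commands below produce; the word and its segmentation
+  # are made-up examples: apply_bpe.py might split a word "ktAb" into the
+  # subwords "kt@@ Ab", and the per-character spelling step then yields
+  # lexicon entries of the form "kt@@ k t" and "Ab A b".)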
+ # get pair_code file + cut -d ' ' -f2- data/train/text | sed 's///g;s///g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt + mv $dir/lexicon.txt $dir/lexicon_word.txt + # get words + cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt + utils/lang/bpe/apply_bpe.py -c data/local/pair_code.txt --glossaries $glossaries < $dir/words.txt | \ + sed 's/ /\n/g' | sort -u > $dir/subwords.txt + sed 's/./& /g' $dir/subwords.txt | sed 's/@ @ //g' | sed 's/*/V/g' | paste -d ' ' $dir/subwords.txt - > $dir/lexicon.txt +fi + +sed -i '1i UNK' $dir/lexicon.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo "$0: Dictionary preparation succeeded" diff --git a/egs/gale_arabic/s5c/local/prepare_lexicon.py b/egs/gale_arabic/s5c/local/prepare_lexicon.py new file mode 100755 index 00000000000..215541585eb --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lexicon.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon. + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data','local', 'lexicon_data', 'processed_lexicon') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + characters = list(line) + characters = " ".join(['V' if char == '*' else char for char in characters]) + lex[line] = characters + +with open(os.path.join('data','local','dict', 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/gale_arabic/s5c/local/prepare_lm.sh b/egs/gale_arabic/s5c/local/prepare_lm.sh new file mode 100755 index 00000000000..6fdf35f471a --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lm.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# Apache 2.0 + +. ./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=3 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -limit-vocab -vocab $dir/wordlist \ + -unk -map-unk "" -kndiscount -interpolate -lm $dir/lm.gz + +#ngram -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5c/local/prepare_lm_subword.sh b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh new file mode 100755 index 00000000000..a5d5c1d1c94 --- /dev/null +++ b/egs/gale_arabic/s5c/local/prepare_lm_subword.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2012 Vassil Panayotov +# 2017 Ewald Enzinger +# 2019 Dongji Gao +# Apache 2.0 + +. 
./path.sh || exit 1 + +echo "=== Building a language model ..." + +dir=data/local/lm/ +text=data/train/text +lexicon=data/local/dict/lexicon.txt +# Language model order +order=6 + +. utils/parse_options.sh + +# Prepare a LM training corpus from the transcripts +mkdir -p $dir + +for f in "$text" "$lexicon"; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +cat data/train/text | cut -d " " -f 2- > $dir/text.txt +cat data/test/text | cut -d ' ' -f2- > $dir/dev.txt +cut -d' ' -f1 $lexicon > $dir/wordlist + +ngram-count -text $dir/text.txt -order $order -vocab $dir/wordlist \ + -unk -map-unk "" -wbdiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6 -interpolate -lm $dir/lm.gz + +ngram -order $order -lm $dir/lm.gz -ppl $dir/dev.txt +echo "*** Finished building the LM model!" diff --git a/egs/gale_arabic/s5c/local/score.sh b/egs/gale_arabic/s5c/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/gale_arabic/s5c/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/gale_arabic/s5c/local/split_wer.sh b/egs/gale_arabic/s5c/local/split_wer.sh new file mode 100755 index 00000000000..d83a0f79e8c --- /dev/null +++ b/egs/gale_arabic/s5c/local/split_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . 
./path.sh + + +galeFolder=$(utils/make_absolute.sh $1) +symtab=./data/lang/words.txt +find exp/ -maxdepth 3 -type d -name decode\* > list_decode$$ + +#split the test set per type: +awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$ + +# generate the report test set +awk '{print $2}' $galeFolder/report | sort -u > $galeFolder/report_id$$ +comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test + +# generate the conversational test set +awk '{print $2}' $galeFolder/conversational | sort -u > $galeFolder/conversational_id$$ + +comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test + +rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$ + +min_lmwt=7 +max_lmwt=20 +cat list_decode$$ | while read dir; do + for type in report conversational; do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + cp -pr $dir/scoring $dir/scoring_$type + ( cd $dir/scoring_$type; + for x in *.tra test_filt.txt; do + sort -u $x > tmp$$ + join tmp$$ $galeFolder/${type}.test > $x + rm -fr tmp$$ + done + ) + +utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT +done +done + + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "RESULTS generated by $USER at $time" + +echo "Report Results WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2 + +echo "Conversational Results WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2 + +echo "Combined Results for Reports and Conversational WER:" +cat list_decode$$ | while read x; do [ -d $x ] && grep WER $x/wer_?? $x/wer_?| utils/best_wer.sh; done | sort -n -k2 + +rm list_decode$$ + + + diff --git a/egs/gale_arabic/s5c/local/test_list b/egs/gale_arabic/s5c/local/test_list new file mode 100644 index 00000000000..d82cf498804 --- /dev/null +++ b/egs/gale_arabic/s5c/local/test_list @@ -0,0 +1,11 @@ +ALAM_WITHEVENT_ARB_20070116_205800 +ALAM_WITHEVENT_ARB_20070130_205800 +ALAM_WITHEVENT_ARB_20070206_205801 +ALAM_WITHEVENT_ARB_20070213_205800 +ALAM_WITHEVENT_ARB_20070227_205800 +ALAM_WITHEVENT_ARB_20070306_205800 +ALAM_WITHEVENT_ARB_20070313_205800 +ARABIYA_FROMIRAQ_ARB_20070216_175800 +ARABIYA_FROMIRAQ_ARB_20070223_175801 +ARABIYA_FROMIRAQ_ARB_20070302_175801 +ARABIYA_FROMIRAQ_ARB_20070309_175800 diff --git a/egs/gale_arabic/s5c/local/wer_output_filter b/egs/gale_arabic/s5c/local/wer_output_filter new file mode 100755 index 00000000000..fcd40539e7f --- /dev/null +++ b/egs/gale_arabic/s5c/local/wer_output_filter @@ -0,0 +1,4 @@ +#!/bin/sed -f +s/@@ //g +s///g +s///g diff --git a/egs/gale_arabic/s5c/path.sh b/egs/gale_arabic/s5c/path.sh new file mode 100755 index 00000000000..be11b34cbc6 --- /dev/null +++ b/egs/gale_arabic/s5c/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=$(pwd)/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/gale_arabic/s5c/run.sh b/egs/gale_arabic/s5c/run.sh new file mode 100755 index 00000000000..3e363816812 --- /dev/null +++ b/egs/gale_arabic/s5c/run.sh @@ -0,0 +1,131 @@ +#!/bin/bash -e + +# Copyright 2014 QCRI (author: Ahmed Ali) +# 2019 Dongji Gao +# Apache 2.0 + +# This is an example script for subword implementation + +num_jobs=120 +num_decode_jobs=40 +decode_gmm=true +stage=0 +overwrite=false +num_merges=1000 + +dir1=/export/corpora/LDC/LDC2013S02/ +dir2=/export/corpora/LDC/LDC2013S07/ +dir3=/export/corpora/LDC/LDC2014S07/ +text1=/export/corpora/LDC/LDC2013T17/ +text2=/export/corpora/LDC/LDC2013T04/ +text3=/export/corpora/LDC/LDC2014T17/ + +galeData=GALE +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: preparing data..." + local/prepare_data.sh --dir1 $dir1 --dir2 $dir2 --dir3 $dir3 \ + --text1 $text1 --text2 $text2 --text3 $text3 + + echo "$0: Preparing lexicon and LM..." + local/prepare_dict_subword.sh --num_merges $num_merges + + utils/subword/prepare_lang_subword.sh data/local/dict "" data/local/lang data/lang + + for set in train test; do + utils/subword/prepare_subword_text.sh data/$set/text data/local/pair_code.txt data/$set/text + done + + local/prepare_lm_subword.sh + + utils/format_lm.sh data/lang data/local/lm/lm.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +mfccdir=mfcc +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." 
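+  # (Optional sanity checks you can run by hand after this stage; they only
+  # inspect the generated features and data directories:
+  #   feat-to-dim scp:data/train/feats.scp -
+  #   utils/validate_data_dir.sh data/train
+  # )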
+ for x in train test ; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj $num_jobs \ + data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + done +fi + +if [ $stage -le 2 ]; then + echo "$0: creating sub-set and training monophone system" + utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; + + steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ + data/train.10K data/lang exp/mono_subword || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Aligning data using monophone system" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/mono_subword exp/mono_ali_subword || exit 1; + + echo "$0: training triphone system with delta features" + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang exp/mono_ali_subword exp/tri1_subword || exit 1; +fi + +if [ $stage -le 4 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri1_subword exp/tri1_subword/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri1_subword/graph data/test exp/tri1_subword/decode +fi + +if [ $stage -le 5 ]; then + echo "$0: Aligning data and retraining and realigning with lda_mllt" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri1_subword exp/tri1_ali_subword || exit 1; + + steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ + data/train data/lang exp/tri1_ali_subword exp/tri2b_subword || exit 1; +fi + +if [ $stage -le 6 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2b_subword exp/tri2b_subword/graph + steps/decode.sh --nj $num_decode_jobs --cmd "$decode_cmd" \ + exp/tri2b_subword/graph data/test exp/tri2b_subword/decode +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning data and retraining and realigning with sat_basis" + steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2b_subword exp/tri2b_ali_subword || exit 1; + + steps/train_sat_basis.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2b_ali_subword exp/tri3b_subword || exit 1; + + steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri3b_subword exp/tri3b_ali_subword || exit 1; +fi + +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3b_subword exp/tri3b_subword/graph + steps/decode_fmllr.sh --nj $num_decode_jobs --cmd \ + "$decode_cmd" exp/tri3b_subword/graph data/test exp/tri3b_subword/decode +fi + +if [ $stage -le 9 ]; then + echo "$0: Training a regular chain model using the e2e alignments..." 
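+  # local/chain/run_tdnn.sh is a symlink to tuning/run_tdnn_1a.sh; it runs the
+  # speed-perturbation and i-vector preparation (local/nnet3/run_ivector_common.sh)
+  # and the chain-specific stages (local/chain/run_chain_common.sh) before
+  # training and decoding the TDNN-F model.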
+ local/chain/run_tdnn.sh --gmm tri3b_subword +fi + +echo "$0: training succeed" +exit 0 diff --git a/egs/gale_arabic/s5c/steps b/egs/gale_arabic/s5c/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/gale_arabic/s5c/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/gale_arabic/s5c/utils b/egs/gale_arabic/s5c/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/gale_arabic/s5c/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index 2e2810bb713..c6a80240754 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -130,7 +130,9 @@ unset LC_ALL # are equal cat $dict_dir/ch-dict.txt |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); $word_len = length($A[0]); @@ -299,4 +301,3 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", export LC_ALL=C echo "$0: Done" - diff --git a/egs/gale_mandarin/s5/local/gale_segment.py b/egs/gale_mandarin/s5/local/gale_segment.py index 975ddb9c143..d652eb837f3 100755 --- a/egs/gale_mandarin/s5/local/gale_segment.py +++ b/egs/gale_mandarin/s5/local/gale_segment.py @@ -1,6 +1,7 @@ #!/usr/bin/env python #coding:utf-8 #!/usr/bin/env python +from __future__ import print_function import sys from mmseg import seg_txt for line in sys.stdin: @@ -12,4 +13,4 @@ continue for j in seg_txt(blks[i]): out_line += " " + j - print out_line + print(out_line) diff --git a/egs/gop/README.md b/egs/gop/README.md new file mode 100644 index 00000000000..d95f4e966fd --- /dev/null +++ b/egs/gop/README.md @@ -0,0 +1,98 @@ +There is a copy of this document on Google Docs, which renders the equations better: +[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing) + +* * * + +# GOP on Kaldi + +The Goodness of Pronunciation (GOP) is a variation of the posterior probability, for phone level pronunciation scoring. +GOP is widely used in pronunciation evaluation and mispronunciation detection tasks. + +This implementation is mainly based on the following paper: + +Hu, W., Qian, Y., Soong, F. K., & Wang, Y. (2015). Improved mispronunciation detection with deep neural network trained acoustic models and transfer learning based logistic regression classifiers. Speech Communication, 67(January), 154-166. + +## GOP-GMM + +In the conventional GMM-HMM based system, GOP was first proposed in (Witt et al., 2000). It was defined as the duration normalised log of the posterior: + +$$ +GOP(p)=\frac{1}{t_e-t_s+1} \log p(p|\mathbf o) +$$ + +where $\mathbf o$ is the input observations, $p$ is the canonical phone, $t_s, t_e$ are the start and end frame indexes. + +Assuming $p(q_i)\approx p(q_j)$ for any $q_i, q_j$, we have: + +$$ +\log p(p|\mathbf o)=\frac{p(\mathbf o|p)p(p)}{\sum_{q\in Q} p(\mathbf o|q)p(q)} + \approx\frac{p(\mathbf o|p)}{\sum_{q\in Q} p(\mathbf o|q)} +$$ + +where $Q$ is the whole phone set. + +The numerator of the equation is calculated from forced alignment result and the denominator is calculated from an Viterbi decoding with a unconstrained phone loop. + +We do not implement GOP-GMM for Kaldi, as GOP-NN performs much better than GOP-GMM. + +## GOP-NN + +The definition of GOP-NN is a bit different from the GOP-GMM. 
GOP-NN was defined as the log phone posterior ratio between the canonical phone and the one with the highest score (Hu et al., 2015). + +Firstly we define Log Phone Posterior (LPP): + +$$ +LPP(p)=\log p(p|\mathbf o; t_s,t_e) +$$ + +Then we define the GOP-NN using LPP: + +$$ +GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} +$$ + +LPP could be calculated as: + +$$ +LPP(p) \approx \frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) +$$ + +$$ +p(p|o_t) = \sum_{s \in p} p(s|o_t) +$$ + +where $s$ is the senone label, $\{s|s \in p\}$ is the states belonging to those triphones whose current phone is $p$. + +## Phone-level Feature + +Normally the classifier-based approach archives better performance than GOP-based approach. + +Different from GOP based method, an extra supervised training process is needed. The input features for supervised training are phone-level, segmental features. The phone-level feature is defined as: + +$$ +{[LPP(p_1),\cdots,LPP(p_M), LPR(p_1|p_i), \cdots, LPR(p_j|p_i),\cdots]}^T +$$ + +where the Log Posterior Ratio (LPR) between phone $p_j$ and $p_i$ is defined as: + +$$ +LPR(p_j|p_i) = \log p(p_j|\mathbf o; t_s, t_e) - \log p(p_i|\mathbf o; t_s, t_e) +$$ + +## Implementation + +This implementation consists of a executable binary `bin/compute-gop` and some scripts. + +`compute-gop` computes GOP and extracts phone-level features using nnet output probabilities. +The output probabilities are assumed to be from a log-softmax layer. + +The script `run.sh` shows a typical pipeline based on librispeech's model and data. + +In Hu's paper, GOP was computed using a feed-forward DNN. +We have tried to use the output-xent of a chain model to compute GOP, but the result was not good. +We guess the HMM topo of chain model may not fit for GOP. + +The nnet3's TDNN (no chain) model performs well in GOP computing, so this recipe uses it. + +## Acknowledgement +The author of this recipe would like to thank Xingyu Na for his works of model tuning and his helpful suggestions. diff --git a/egs/gop/s5/cmd.sh b/egs/gop/s5/cmd.sh new file mode 100644 index 00000000000..9139633e57a --- /dev/null +++ b/egs/gop/s5/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
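+# On a grid you would typically use something like
+#   export cmd="queue.pl --mem 4G"
+# instead of run.pl below (the memory value here is only an example).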
+ +export cmd="run.pl" diff --git a/egs/gop/s5/local/make_testcase.sh b/egs/gop/s5/local/make_testcase.sh new file mode 100755 index 00000000000..884563066b1 --- /dev/null +++ b/egs/gop/s5/local/make_testcase.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +src=$1 +dst=$2 + +# Select a very small set for testing +utils/subset_data_dir.sh --shortest $src 10 $dst + +# make fake transcripts as negative examples +cp $dst/text $dst/text.ori +sed -i "s/ THERE / THOSE /" $dst/text +sed -i "s/ IN / ON /" $dst/text diff --git a/egs/gop/s5/local/remove_phone_markers.pl b/egs/gop/s5/local/remove_phone_markers.pl new file mode 100755 index 00000000000..16236a749cf --- /dev/null +++ b/egs/gop/s5/local/remove_phone_markers.pl @@ -0,0 +1,72 @@ +#!/usr/bin/env perl +# Copyright 2019 Junbo Zhang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; + +my $Usage = <new phone mapping file, in which each line is: "old-integer-id new-integer-id. + +Usage: utils/remove_phone_markers.pl + e.g.: utils/remove_phone_markers.pl phones.txt phones-pure.txt phone-to-pure-phone.int +EOU + +if (@ARGV < 3) { + die $Usage; +} + +my $old_phone_symbols_filename = shift @ARGV; +my $new_phone_symbols_filename = shift @ARGV; +my $mapping_filename = shift @ARGV; + +my %id_of_old_phone; +open(IN, $old_phone_symbols_filename) or die "Can't open $old_phone_symbols_filename"; +while () { + chomp; + my ($phone, $id) = split; + next if $phone =~ /\#/; + $id_of_old_phone{$phone} = $id; +} +close IN; + +my $new_id = 0; +my %id_of_new_phone; +my %id_old_to_new; +foreach (sort { $id_of_old_phone{$a} <=> $id_of_old_phone{$b} } keys %id_of_old_phone) { + my $old_phone = $_; + s/_[BIES]//; + s/\d//; + my $new_phone = $_; + $id_of_new_phone{$new_phone} = $new_id++ if not exists $id_of_new_phone{$new_phone}; + $id_old_to_new{$id_of_old_phone{$old_phone}} = $id_of_new_phone{$new_phone}; +} + +# Write to file +open(OUT, ">$new_phone_symbols_filename") or die "Can\'t write to $new_phone_symbols_filename"; +foreach (sort { $id_of_new_phone{$a} <=> $id_of_new_phone{$b} } keys %id_of_new_phone) { + print OUT "$_\t$id_of_new_phone{$_}\n"; +} +close OUT; + +open(OUT, ">$mapping_filename") or die "Can\'t write to $mapping_filename"; +foreach (sort { $a <=> $b } keys %id_old_to_new) { + next if $_ == 0; + print OUT "$_ $id_old_to_new{$_}\n"; +} +close OUT; diff --git a/egs/gop/s5/path.sh b/egs/gop/s5/path.sh new file mode 100755 index 00000000000..03df6dd9f2b --- /dev/null +++ b/egs/gop/s5/path.sh @@ -0,0 +1,27 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. 
$KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + +# we use this both in the (optional) LM training and the G2P-related scripts +PYTHON='python2.7' + +### Below are the paths used by the optional parts of the recipe + +# We only need the Festival stuff below for the optional text normalization(for LM-training) step +FEST_ROOT=tools/festival +NSW_PATH=${FEST_ROOT}/festival/bin:${FEST_ROOT}/nsw/bin +export PATH=$PATH:$NSW_PATH + +# SRILM is needed for LM model building +SRILM_ROOT=$KALDI_ROOT/tools/srilm +SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64 +export PATH=$PATH:$SRILM_PATH + +# Sequitur G2P executable +sequitur=$KALDI_ROOT/tools/sequitur/g2p.py +sequitur_path="$(dirname $sequitur)/lib/$PYTHON/site-packages" + +# Directory under which the LM training corpus should be extracted +LM_CORPUS_ROOT=./lm-corpus diff --git a/egs/gop/s5/run.sh b/egs/gop/s5/run.sh new file mode 100755 index 00000000000..a731b913552 --- /dev/null +++ b/egs/gop/s5/run.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2019 Junbo Zhang +# Apache 2.0 + +# This script shows how to calculate Goodness of Pronunciation (GOP) and +# extract phone-level pronunciation feature for mispronunciations detection +# tasks. Read ../README.md or the following paper for details: +# +# "Hu et al., Improved mispronunciation detection with deep neural network +# trained acoustic models and transfer learning based logistic regression +# classifiers, 2015." + +# You might not want to do this for interactive shells. +set -e + +# Before running this recipe, you have to run the librispeech recipe firstly. +# This script assumes the following paths exist. +librispeech_eg=../../librispeech/s5 +model=$librispeech_eg/exp/nnet3_cleaned/tdnn_sp +ivector=$librispeech_eg/exp/nnet3_cleaned/ivectors_test_clean_hires +lang=$librispeech_eg/data/lang +test_data=$librispeech_eg/data/test_clean_hires + +for d in $model $ivector $lang $test_data; do + [ ! -d $d ] && echo "$0: no such path $d" && exit 1; +done + +# Global configurations +stage=0 +nj=4 + +data=test_10short +dir=exp/gop_$data + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +if [ $stage -le 0 ]; then + # Prepare test data + [ -d data ] || mkdir -p data/$data + local/make_testcase.sh $test_data data/$data +fi + +if [ $stage -le 1 ]; then + # Compute Log-likelihoods + steps/nnet3/compute_output.sh --cmd "$cmd" --nj $nj \ + --online-ivector-dir $ivector data/$data $model exp/probs_$data +fi + +if [ $stage -le 2 ]; then + steps/nnet3/align.sh --cmd "$cmd" --nj $nj --use_gpu false \ + --online_ivector_dir $ivector data/$data $lang $model $dir +fi + +if [ $stage -le 3 ]; then + # make a map which converts phones to "pure-phones" + # "pure-phone" means the phone whose stress and pos-in-word markers are ignored + # eg. AE1_B --> AE, EH2_S --> EH, SIL --> SIL + local/remove_phone_markers.pl $lang/phones.txt $dir/phones-pure.txt \ + $dir/phone-to-pure-phone.int + + # Convert transition-id to pure-phone id + $cmd JOB=1:$nj $dir/log/ali_to_phones.JOB.log \ + ali-to-phones --per-frame=true $model/final.mdl "ark,t:gunzip -c $dir/ali.JOB.gz|" \ + "ark,t:-" \| utils/apply_map.pl -f 2- $dir/phone-to-pure-phone.int \| \ + gzip -c \>$dir/ali-pure-phone.JOB.gz || exit 1; +fi + +if [ $stage -le 4 ]; then + # The outputs of the binary compute-gop are the GOPs and the phone-level features. 
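+  # Both archives are written as text ("ark,t:..."), so once this stage has run
+  # they can be inspected directly, e.g.:
+  #   head -n 1 $dir/gop.1.txt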
+ # + # An example of the GOP result (extracted from "ark,t:$dir/gop.3.txt"): + # 4446-2273-0031 [ 1 0 ] [ 12 0 ] [ 27 -5.382001 ] [ 40 -13.91807 ] [ 1 -0.2555897 ] \ + # [ 21 -0.2897284 ] [ 5 0 ] [ 31 0 ] [ 33 0 ] [ 3 -11.43557 ] [ 25 0 ] \ + # [ 16 0 ] [ 30 -0.03224623 ] [ 5 0 ] [ 25 0 ] [ 33 0 ] [ 1 0 ] + # It is in the posterior format, where each pair stands for [pure-phone-index gop-value]. + # For example, [ 27 -5.382001 ] means the GOP of the pure-phone 27 (it corresponds to the + # phone "OW", according to "$dir/phones-pure.txt") is -5.382001, indicating the audio + # segment of this phone should be a mispronunciation. + # + # The phone-level features are in matrix format: + # 4446-2273-0031 [ -0.2462088 -10.20292 -11.35369 ... + # -8.584108 -7.629755 -13.04877 ... + # ... + # ... ] + # The row number is the phone number of the utterance. In this case, it is 17. + # The column number is 2 * (pure-phone set size), as the feature is consist of LLR + LPR. + # The phone-level features can be used to train a classifier with human labels. See Hu's + # paper for detail. + $cmd JOB=1:$nj $dir/log/compute_gop.JOB.log \ + compute-gop --phone-map=$dir/phone-to-pure-phone.int $model/final.mdl \ + "ark,t:gunzip -c $dir/ali-pure-phone.JOB.gz|" \ + "ark:exp/probs_$data/output.JOB.ark" \ + "ark,t:$dir/gop.JOB.txt" "ark,t:$dir/phonefeat.JOB.txt" || exit 1; + echo "Done compute-gop, the results: \"$dir/gop..txt\" in posterior format." + + # We set -5 as a universal empirical threshold here. You can also determine multiple phone + # dependent thresholds based on the human-labeled mispronunciation data. + echo "The phones whose gop values less than -5 could be treated as mispronunciations." +fi diff --git a/egs/gop/s5/steps b/egs/gop/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/gop/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/gop/s5/utils b/egs/gop/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/gop/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/gp/s1/local/gp_convert_audio.sh b/egs/gp/s1/local/gp_convert_audio.sh index a7c2d7285c4..b3db909c9b6 100755 --- a/egs/gp/s1/local/gp_convert_audio.sh +++ b/egs/gp/s1/local/gp_convert_audio.sh @@ -108,4 +108,4 @@ done < "$INLIST" echo "sox: error converting following $nsoxerr file(s):" >&2 [ -f "$soxerr" ] && cat "$soxerr" >&2 -exit 0; \ No newline at end of file +exit 0; diff --git a/egs/gp/s1/utils/mkgraph.sh b/egs/gp/s1/utils/mkgraph.sh index 2e45296593b..3aba742832d 100755 --- a/egs/gp/s1/utils/mkgraph.sh +++ b/egs/gp/s1/utils/mkgraph.sh @@ -131,4 +131,4 @@ cp $lang/silphones.csl $dir/ # to make const fst: # fstconvert --fst_type=const $dir/HCLG.fst $dir/HCLG_c.fst -echo "Finished making decoding graphs in $dir" \ No newline at end of file +echo "Finished making decoding graphs in $dir" diff --git a/egs/heroico/s5/RESULTS b/egs/heroico/s5/RESULTS index 9717e95e6e2..7942c03b1d9 100644 --- a/egs/heroico/s5/RESULTS +++ b/egs/heroico/s5/RESULTS @@ -1,22 +1,48 @@ # for dir in $(echo exp/tri*/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done -%WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 -%WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 -%WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 -%WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] 
exp/tri1/decode_test/wer_13_1.0 -%WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 -%WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 -%WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 -%WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 -%WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 -%WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 -%WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 -%WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 -%WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 -%WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 -%WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 -%WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 -%WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 -%WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 -%WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 -%WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 +# old results before adding Movie subtitles text corpus in LM training: +# %WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 +# %WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 +# %WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 +# %WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 +# %WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 +# %WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 +# %WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 +# %WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 +# %WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 +# %WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 +# %WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 +# %WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +# %WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 +# %WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +# %WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 +# %WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 +# %WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 +# %WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 +# %WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 +# %WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] 
exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 + +# new results: +%WER 18.27 [ 1398 / 7650, 213 ins, 253 del, 932 sub ] exp/tri1/decode_devtest/wer_15_0.5 +%WER 9.95 [ 746 / 7498, 74 ins, 108 del, 564 sub ] exp/tri1/decode_native/wer_13_0.5 +%WER 16.63 [ 1532 / 9215, 197 ins, 183 del, 1152 sub ] exp/tri1/decode_nonnative/wer_17_0.0 +%WER 13.68 [ 2287 / 16713, 207 ins, 360 del, 1720 sub ] exp/tri1/decode_test/wer_17_0.5 +%WER 17.19 [ 1315 / 7650, 227 ins, 231 del, 857 sub ] exp/tri2b/decode_devtest/wer_17_0.5 +%WER 9.23 [ 692 / 7498, 60 ins, 103 del, 529 sub ] exp/tri2b/decode_native/wer_16_0.5 +%WER 17.16 [ 1581 / 9215, 184 ins, 216 del, 1181 sub ] exp/tri2b/decode_nonnative/wer_17_0.5 +%WER 13.64 [ 2279 / 16713, 241 ins, 326 del, 1712 sub ] exp/tri2b/decode_test/wer_17_0.5 +%WER 15.36 [ 1175 / 7650, 212 ins, 210 del, 753 sub ] exp/tri3b/decode_devtest/wer_17_0.5 +%WER 20.27 [ 1551 / 7650, 269 ins, 257 del, 1025 sub ] exp/tri3b/decode_devtest.si/wer_14_1.0 +%WER 6.40 [ 480 / 7498, 50 ins, 58 del, 372 sub ] exp/tri3b/decode_native/wer_16_0.0 +%WER 10.91 [ 818 / 7498, 100 ins, 112 del, 606 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +%WER 14.30 [ 1318 / 9215, 206 ins, 134 del, 978 sub ] exp/tri3b/decode_nonnative/wer_17_0.0 +%WER 21.62 [ 1992 / 9215, 286 ins, 224 del, 1482 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +%WER 10.78 [ 1802 / 16713, 247 ins, 195 del, 1360 sub ] exp/tri3b/decode_test/wer_17_0.0 +%WER 16.81 [ 2809 / 16713, 374 ins, 338 del, 2097 sub ] exp/tri3b/decode_test.si/wer_16_1.0 + +# chain model results: +# for dir in $(echo exp/chain/tdnn1b_sp/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done +%WER 12.99 [ 994 / 7650, 192 ins, 163 del, 639 sub ] exp/chain/tdnn1b_sp/decode_devtest/wer_10_1.0 +%WER 12.47 [ 1149 / 9215, 119 ins, 174 del, 856 sub ] exp/chain/tdnn1b_sp/decode_nonnative/wer_12_0.0 +%WER 9.64 [ 1611 / 16713, 169 ins, 240 del, 1202 sub ] exp/chain/tdnn1b_sp/decode_test/wer_12_0.0 +%WER 6.13 [ 460 / 7498, 52 ins, 55 del, 353 sub ] exp/chain/tdnn1b_sp/decode_native/wer_10_0.0 diff --git a/egs/heroico/s5/cmd.sh b/egs/heroico/s5/cmd.sh index a427f3c16a5..533aad25db1 100755 --- a/egs/heroico/s5/cmd.sh +++ b/egs/heroico/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="retry.pl queue.pl" export train_cmd="retry.pl queue.pl" export decode_cmd="retry.pl queue.pl --mem 2G" diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..361879b4142 --- /dev/null +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,318 @@ +#!/bin/bash + +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1b.sh but taking +# the xconfig from mini-librispeech's run_cnn_tdnn_1a54.sh; only +# reducing the bottleneck-dim from 96 to 64, which is the value +# the run_tdnn1b.sh script here has. Results are better. 
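+# (In the comparison below, the %WER rows are picked by utils/best_wer.sh over
+# each decode directory, the "Final train/valid prob" rows are the chain
+# objective values from compute_prob_{train,valid}.final.log, the "(xent)" rows
+# are the cross-entropy branch, and "Num-params" comes from progress.1.log;
+# this is what the compare_wer.sh scripts in these recipes grep for.)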
+# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a_sp +# %WER devtest 53.07 52.54 51.10 +# %WER test 59.25 53.70 52.07 +# %WER native 54.47 48.76 47.88 +# %WER nonnative 63.01 57.66 55.51 +# Final train prob -0.0253 -0.0547 -0.0502 +# Final valid prob -0.0687 -0.0694 -0.0661 +# Final train prob (xent) -0.7715 -0.9502 -0.8513 +# Final valid prob (xent) -1.0719 -1.0849 -0.9915 +# Num-params 6567648 3321312 3345088 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="native nonnative devtest test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_leaves=3500 + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --cmd "$train_cmd" \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + $num_leaves \ + ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + cnn_opts="l2-regularize=0.03" + ivector_layer_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. 
The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). + tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + 
--trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 \ + data/lang_test \ + $tree_dir \ + $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1392 combine=-0.040->-0.033 (over 7) xent:train/valid[69,104,final]=(-1.12,-0.880,-0.771/-1.33,-1.21,-1.07) logprob:train/valid[69,104,final]=(-0.050,-0.031,-0.025/-0.079,-0.080,-0.069) +# exp/chain/tdnn1a_sp: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1384 combine=-0.032->-0.026 (over 7) xent:train/valid[69,104,final]=(-1.14,-0.892,-0.811/-1.19,-1.07,-0.990) logprob:train/valid[69,104,final]=(-0.045,-0.029,-0.023/-0.083,-0.080,-0.072) # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -149,7 +150,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.01" output_opts="l2-regularize=0.0025" diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh index 33ce1556d29..cfb4dc1f697 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -3,21 +3,20 @@ # 1b is as 1a but a re-tuned model with quite a few changes, including moving to # a resnet-style factored TDNN-F model. 
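+# (For orientation: a "factored TDNN-F" layer factors its weight matrix through
+# a small bottleneck and adds a scaled skip connection from its input; in
+# xconfig terms a layer looks roughly like
+#   tdnnf-layer name=tdnnfN $tdnnf_opts dim=<hidden-dim> bottleneck-dim=64 time-stride=3
+# where "tdnnfN" and <hidden-dim> are placeholders, and the bypass-scale=0.66
+# inside $tdnnf_opts supplies the resnet-style bypass.  The actual layer
+# definitions are generated in the configs stage further down.)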
# -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp # System tdnn1a_sp tdnn1b_sp -# %WER devtest 53.07 52.54 -# %WER test 59.25 53.70 -# %WER native 54.47 48.76 -# %WER nonnative 63.01 57.66 -# Final train prob -0.0253 -0.0547 -# Final valid prob -0.0687 -0.0694 -# Final train prob (xent) -0.7715 -0.9502 -# Final valid prob (xent) -1.0719 -1.0849 -# Num-params 6567648 3321312 - +# %WER devtest 13.10 12.99 +# %WER test 15.53 9.64 +# %WER native 10.14 6.13 +# %WER nonnative 19.78 12.47 +# Final train prob -0.0233 -0.0442 +# Final valid prob -0.0720 -0.0726 +# Final train prob (xent) -0.8107 -0.9759 +# Final valid prob (xent) -0.9898 -0.9964 +# Num-params 6559440 3318224 # steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp -# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) +# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1384 combine=-0.044->-0.044 (over 1) xent:train/valid[21,33,final]=(-1.30,-0.993,-0.976/-1.28,-1.01,-0.996) logprob:train/valid[21,33,final]=(-0.071,-0.050,-0.044/-0.093,-0.076,-0.073) # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -152,7 +151,7 @@ if [ $stage -le 13 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" diff --git a/egs/heroico/s5/local/heroico_answers_make_lists.pl b/egs/heroico/s5/local/heroico_answers_make_lists.pl index fb3c0ecb8d1..c1a3735b4f1 100755 --- a/egs/heroico/s5/local/heroico_answers_make_lists.pl +++ b/egs/heroico/s5/local/heroico_answers_make_lists.pl @@ -30,7 +30,7 @@ my $t = "$tmpdir/answers/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { @@ -40,9 +40,27 @@ my @dirs = split /\//, $directories; # get the speaker number my $s = $dirs[-1]; + # pad the speaker number with zeroes + my $spk = ""; + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . $s; + } + # pad the filename with zeroes + my $fn = ""; + if ( $file < 10 ) { + $fn = '000' . $file; + } elsif ( $file < 100 ) { + $fn = '00' . $file; + } elsif ( $file < 1000 ) { + $fn = '0' . $file; + } # the utterance name - my $i = $s . '_' . 'a' . '_' . $file; - $p{$i} = $sent; + my $utt = $spk . '_' . $fn; + $prompts{$utt} = $sent; } open my $W, '<', $w or croak "problem with $w $!"; @@ -58,18 +76,36 @@ my @dirs = split /\//, $directories; my $r = basename $line, ".wav"; my $s = $dirs[-1]; - my $rid = $s . '_' . 'a' . '_' . $r; - if ( exists $p{$rid} ) { - print $T "$rid $p{$rid}\n"; - } elsif ( defined $rid ) { - warn "warning: problem\t$rid"; + my $spk = ""; + # pad with zeroes + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . 
$s; + } + # pad the file name with zeroes + my $rec = ""; + if ( $r < 10 ) { + $rec = '000' . $r; + } elsif ( $r < 100 ) { + $rec = '00' . $r; + } elsif ( $r < 1000 ) { + $rec = '0' . $r; + } + my $rec_id = $spk . '_' . $rec; + if ( exists $prompts{$rec_id} ) { + print $T "$rec_id $prompts{$rec_id}\n"; + } elsif ( defined $rec_id ) { + warn "warning: problem\t$rec_id"; next LINE; } else { croak "$line"; } - print $O "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $U "$rid ${s}_a\n"; + print $O "$rec_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $U "$rec_id $spk\n"; } close $T; close $O; diff --git a/egs/heroico/s5/local/heroico_recordings_make_lists.pl b/egs/heroico/s5/local/heroico_recordings_make_lists.pl index 1d157665799..b9a3ab5a565 100755 --- a/egs/heroico/s5/local/heroico_recordings_make_lists.pl +++ b/egs/heroico/s5/local/heroico_recordings_make_lists.pl @@ -19,75 +19,102 @@ system "mkdir -p $tmpdir/recordings/devtest"; # input wav file list -my $w = "$tmpdir/wav_list.txt"; +my $input_wav_list = "$tmpdir/wav_list.txt"; # output temporary wav.scp files -my $o_train = "$tmpdir/recordings/train/wav.scp"; -my $o_test = "$tmpdir/recordings/devtest/wav.scp"; +my $train_wav_scp = "$tmpdir/recordings/train/wav.scp"; +my $test_wav_scp = "$tmpdir/recordings/devtest/wav.scp"; # output temporary utt2spk files -my $u_train = "$tmpdir/recordings/train/utt2spk"; -my $u_test = "$tmpdir/recordings/devtest/utt2spk"; +my $train_uttspk = "$tmpdir/recordings/train/utt2spk"; +my $test_uttspk = "$tmpdir/recordings/devtest/utt2spk"; # output temporary text files -my $t_train = "$tmpdir/recordings/train/text"; -my $t_test = "$tmpdir/recordings/devtest/text"; +my $train_text = "$tmpdir/recordings/train/text"; +my $test_text = "$tmpdir/recordings/devtest/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { chomp $line; - my ($s,$sent) = split /\t/, $line, 2; - $p{$s} = $sent; + my ($prompt_id,$prompt) = split /\t/, $line, 2; + # pad the prompt id with zeroes + my $pid = ""; + if ( $prompt_id < 10 ) { + $pid = '0000' . $prompt_id; + } elsif ( $prompt_id < 100 ) { + $pid = '000' . $prompt_id; + } elsif ( $prompt_id < 1000 ) { + $pid = '00' . 
$prompt_id; + } + $prompts{$pid} = $prompt; } -open my $W, '<', $w or croak "problem with $w $!"; -open my $OT, '+>', $o_train or croak "problem with $o_train $!"; -open my $OE, '+>', $o_test or croak "problem with $o_test $!"; -open my $UT, '+>', $u_train or croak "problem with $u_train $!"; -open my $UE, '+>', $u_test or croak "problem with $u_test $!"; -open my $TT, '+>', $t_train or croak "problem with $t_train $!"; -open my $TE, '+>', $t_test or croak "problem with $t_test $!"; +open my $WVL, '<', $input_wav_list or croak "problem with $input_wav_list $!"; +open my $TRNWSCP, '+>', $train_wav_scp or croak "problem with $train_wav_scp $!"; +open my $TSTWSCP, '+>', $test_wav_scp or croak "problem with $test_wav_scp $!"; +open my $TRNUTTSPK, '+>', $train_uttspk or croak "problem with $train_uttspk $!"; +open my $TSTUTTSPK, '+>', $test_uttspk or croak "problem with $test_uttspk $!"; +open my $TRNTXT, '+>', $train_text or croak "problem with $train_text $!"; +open my $TSTTXT, '+>', $test_text or croak "problem with $test_text $!"; - LINE: while ( my $line = <$W> ) { + LINE: while ( my $line = <$WVL> ) { chomp $line; next LINE if ($line =~ /Answers/ ); next LINE unless ( $line =~ /Recordings/ ); my ($volume,$directories,$file) = File::Spec->splitpath( $line ); my @dirs = split /\//, $directories; - my $r = basename $line, ".wav"; - my $s = $dirs[-1]; - my $rid = $s . '_r' . '_' . $r; - if ( ( $r >= 355 ) and ( $r < 561 ) ) { - if ( exists $p{$r} ) { - print $TE "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + my $utt_id = basename $line, ".wav"; + # pad the utterance id with zeroes + my $utt = ""; + if ( $utt_id < 10 ) { + $utt = '0000' . $utt_id; +} elsif ( $utt_id < 100 ) { + $utt = '000' . $utt_id; +} elsif ( $utt_id < 1000 ) { + $utt = '00' . $utt_id; +} + my $spk_id = $dirs[-1]; + # pad the speaker id with zeroes + my $spk = ""; + if ( $spk_id < 10 ) { + $spk = '000' . $spk_id; + } elsif ( $spk_id < 100 ) { + $spk = '00' . $spk_id; + } elsif ( $spk_id < 1000 ) { + $spk = '0' . $spk_id; + } + my $spk_utt_id = $spk . '_' . 
$utt; + if ( ( $utt_id >= 355 ) and ( $utt_id < 561 ) ) { +if ( exists $prompts{$utt} ) { + print $TSTTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OE "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UE "$rid ${s}_r\n"; - } elsif ( ( $r < 355 ) or ( $r > 560 ) ) { - if ( exists $p{$r} ) { - print $TT "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + print $TSTWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TSTUTTSPK "$spk_utt_id $spk\n"; + } elsif ( ( $utt_id < 355 ) or ( $utt_id > 560 ) ) { + if ( exists $prompts{$utt} ) { + print $TRNTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OT "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UT "$rid ${s}_r\n"; - } + print $TRNWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TRNUTTSPK "$spk_utt_id $spk\n"; + } } -close $TT; -close $OT; -close $UT; -close $TE; -close $OE; -close $UE; -close $W; +close $TRNTXT; +close $TRNWSCP; +close $TRNUTTSPK; +close $TSTTXT; +close $TSTWSCP; +close $TSTUTTSPK; +close $WVL; diff --git a/egs/heroico/s5/local/nnet3/run_ivector_common.sh b/egs/heroico/s5/local/nnet3/run_ivector_common.sh index 153f0073667..e882ce0c918 100755 --- a/egs/heroico/s5/local/nnet3/run_ivector_common.sh +++ b/egs/heroico/s5/local/nnet3/run_ivector_common.sh @@ -9,6 +9,9 @@ set -euo pipefail # of usage. stage=0 +nj=56 +num_threads_ubm=2 + train_set=train test_sets="native nonnative devtest test" gmm=tri3b @@ -37,25 +40,17 @@ if [ $stage -le 1 ]; then utils/data/perturb_data_dir_speed_3way.sh \ data/${train_set} \ data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc.sh \ - --cmd "$train_cmd" \ - --nj 10 \ - data/${train_set}_sp || exit 1; - steps/compute_cmvn_stats.sh \ - data/${train_set}_sp || exit 1; - utils/fix_data_dir.sh \ - data/${train_set}_sp + + echo "$0: making mfcc features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp fi if [ $stage -le 2 ]; then echo "$0: aligning with the perturbed low-resolution data" steps/align_fmllr.sh \ - --nj 20 \ - --cmd "$train_cmd" \ - data/${train_set}_sp \ - data/lang \ - $gmm_dir \ + --nj 20 --cmd "$train_cmd" data/${train_set}_sp data/lang $gmm_dir \ $ali_dir || exit 1 fi diff --git a/egs/heroico/s5/local/prepare_data.sh b/egs/heroico/s5/local/prepare_data.sh index db2b990c07b..b78d9f1d1cb 100755 --- a/egs/heroico/s5/local/prepare_data.sh +++ b/egs/heroico/s5/local/prepare_data.sh @@ -4,17 +4,17 @@ # Apache 2.0. . ./cmd.sh - . ./path.sh stage=0 +datadir=$1 . 
./utils/parse_options.sh set -e set -o pipefail -# the location of the LDC corpus -datadir=$1 +tmpdir=data/local/tmp + # acoustic models are trained on the heroico corpus # testing is done on the usma corpus # heroico consists of 2 parts: answers and recordings (recited) @@ -25,8 +25,6 @@ recordings_transcripts=$datadir/data/transcripts/heroico-recordings.txt # usma is all recited usma_transcripts=$datadir/data/transcripts/usma-prompts.txt -tmpdir=data/local/tmp - # make acoustic model training lists if [ $stage -le 0 ]; then mkdir -p $tmpdir/heroico $tmpdir/usma @@ -37,12 +35,12 @@ if [ $stage -le 0 ]; then # the transcripts are converted to UTF8 export LC_ALL=en_US.UTF-8 cat $answers_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_answers_make_lists.pl + tr -d '\r' | local/heroico_answers_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/answers cat $recordings_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_recordings_make_lists.pl + tr -d '\r' | local/heroico_recordings_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/recordings/train utils/fix_data_dir.sh $tmpdir/heroico/recordings/devtest @@ -52,11 +50,11 @@ if [ $stage -le 0 ]; then for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/$x $tmpdir/heroico/recordings/train/$x | \ - sed -e 's/\r//' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x + tr -d '\r' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x done for x in wav.scp utt2spk text; do - cat $tmpdir/heroico/recordings/devtest/$x | sed -e 's/\r//' | \ + cat $tmpdir/heroico/recordings/devtest/$x | tr -d '\r' | \ sort -k1,1 -u >$tmpdir/heroico/lists/devtest/$x done @@ -67,10 +65,10 @@ fi if [ $stage -le 1 ]; then # make separate lists for usma (US military academy) native and nonnative cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_native_make_lists.pl + tr -d '\r' | dos2unix | local/usma_native_make_lists.pl cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_nonnative_make_lists.pl + tr -d '\r' | local/usma_nonnative_make_lists.pl for n in native nonnative; do mkdir -p $tmpdir/usma/$n/lists @@ -86,14 +84,14 @@ if [ $stage -le 1 ]; then # get training lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/${x} $tmpdir/heroico/recordings/train/${x} | \ - sed -e 's/\r//' >$tmpdir/lists/train/$x + tr -d '\r' >$tmpdir/lists/train/$x sort $tmpdir/lists/train/$x >data/train/$x done # get devtest lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/lists/devtest/$x | \ - sed -e 's/\r//' >$tmpdir/lists/devtest/$x + tr -d '\r' >$tmpdir/lists/devtest/$x sort $tmpdir/lists/devtest/$x >data/devtest/$x done diff --git a/egs/heroico/s5/local/prepare_dict.sh b/egs/heroico/s5/local/prepare_dict.sh index a6d182a6852..9f498bc963a 100755 --- a/egs/heroico/s5/local/prepare_dict.sh +++ b/egs/heroico/s5/local/prepare_dict.sh @@ -13,12 +13,12 @@ fi export LC_ALL=C -cut -f2- data/local/tmp/dict/santiago.txt | \ +cut -f2- ./santiago.txt | \ tr -s '[:space:]' '[\n*]' | \ grep -v SPN | sort -u >data/local/dict/nonsilence_phones.txt # sed "1d" deletes the last line. 
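+# (To be precise, sed "1d" drops the *first* line of the sorted output; a quick
+# check: printf 'a 1\nb 2\n' | sed "1d" prints only "b 2".)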
-expand -t 1 data/local/tmp/dict/santiago.txt | sort -u | +expand -t 1 ./santiago.txt | sort -u | sed "1d" >data/local/dict/lexicon.txt echo " SPN" >> data/local/dict/lexicon.txt diff --git a/egs/heroico/s5/local/subs_download.sh b/egs/heroico/s5/local/subs_download.sh new file mode 100755 index 00000000000..98dcb42d4e0 --- /dev/null +++ b/egs/heroico/s5/local/subs_download.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright 2017 John Morgan +# Apache 2.0. + +tmpdir=data/local/tmp +download_dir=$(pwd) +mkdir -p $download_dir +subs_src=$1 + +# download the subs corpus +if [ ! -f $download_dir/subs.zip ]; then + wget -O $download_dir/subs.zip $subs_src + ( + cd $download_dir + unzip subs.zip + ) + else + echo "$0: subs file already downloaded." +fi diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index 3cd906d4699..e39db79f610 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Copyright 2017 John Morgan # Apache 2.0. @@ -12,69 +12,64 @@ use Encode; # set lower and upper bounds -my $lb = 8; -# only segments with at least $lb words will be written -my $ub = 16; -# only segments with fewer than $ub words will be written +my $low_bound = 8; +# only segments with at least $low_bound words will be written +my $up_bound = 16; +# only segments with fewer than $up_bound words will be written # input and output files -my $c = "data/local/tmp/subs/OpenSubtitles2016.en-es.es"; -my $symtab = "data/lang/words.txt"; -my $rl = "data/local/tmp/subs/lm/es.txt"; -my $oo = "data/local/tmp/subs/lm/oovs.txt"; + +my $corpus = "OpenSubtitles.en-es.es"; +my $symbol_table = "data/lang/words.txt"; +my $filtered = "data/local/tmp/subs/lm/es.txt"; +my $oovs = "data/local/tmp/subs/lm/oovs.txt"; my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt"; -open my $C, '<', $c or croak "problems with $c $!"; +open my $C, '<', $corpus or croak "problems with $corpus $!"; system "mkdir -p data/local/tmp/subs/lm"; -open my $RL, '+>:utf8', $rl or croak "problems with $rl $!"; - -LINE: while ( my $line = <$C> ) { - $line = decode_utf8 $line; - chomp $line; - - my @tokens = split /\s+/, $line; - - next LINE if ( ($#tokens < $lb) or ($#tokens > $ub )); - - #remove control characters - #$line =~ s/(\p{Other})/ /g; - #$line =~ s/(\p{Control})/ /g; - #$line =~ s/(\p{Format})/ /g; - #$line =~ s/(\p{Private_Use})/ /g; - #$line =~ s/(\p{Surrogate})/ /g; - - # punctuation - $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; -#convert tabs to white space - $line =~ s/\t/ /g; - #hard to soft space - $line =~ s/ / /g; -#squeeze white space - $line =~ s/\s+/ /g; -#initial and final white space - $line =~ s/^\p{Separator}+//; - $line =~ s/\p{Separator}+$//; -#down case - $line = lc $line; - - - print $RL "$line\n"; - +if ( -e $filtered ) { + warn "$filtered already exists."; +} else { + open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!"; + LINE: while ( my $line = <$C> ) { + $line = decode_utf8 $line; + chomp $line; + + my @tokens = split /\s+/, $line; + + next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound )); + + # remove punctuation + $line =~ 
s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; + #convert tabs to white space + $line =~ s/\t/ /g; + #hard to soft space + $line =~ s/ / /g; + #squeeze white space + $line =~ s/\s+/ /g; + #initial and final white space + $line =~ s/^\p{Separator}+//; + $line =~ s/\p{Separator}+$//; + #down case + $line = lc $line; + + print $FLT "$line\n"; + } + close $FLT; } - close $C; -close $RL; + # find out of vocabulary words -# $symtab points to a file containing a map of symbols to integers +# $symbol_table points to a file containing a map of symbols to integers # hash for word to integer map my %sym2int = (); -open my $F, '<', $symtab or croak "problem with $symtab $!"; +open my $F, '<', $symbol_table or croak "problem with $symbol_table $!"; # store words to int map in hash while( my $line = <$F>) { @@ -84,33 +79,33 @@ } close $F; -open my $I, '<', $rl or croak "problem with $rl $!"; -open my $OO, '+>', $oo or croak "problems with $oo $!"; +open my $I, '<', $filtered or croak "problem with $filtered $!"; +open my $OOVS, '+>', $oovs or croak "problems with $oovs $!"; while ( my $line = <$I>) { chomp $line; my @A = split /\s/, $line; foreach my $a (@A) { if (!defined ($sym2int{$a})) { - print $OO "$a\n"; + print $OOVS "$a\n"; } } } -close $OO; +close $OOVS; close $I; # remove segments with OOVs # store OOVS in hash my %oov = (); -open my $V, '<', $oo or croak "problems with $oo $!"; +open my $V, '<', $oovs or croak "problems with $oovs $!"; while ( my $line = <$V> ) { chomp $line; $oov{$line} = 1; } close $V; -open my $L, '<', $rl or croak "problems with $rl $!"; +open my $L, '<', $filtered or croak "problems with $filtered $!"; open my $IV, '+>', $iv or croak "problems with $iv $!"; SEGMENT: while ( my $segment = <$L> ) { diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 711bece3c66..4cc5617e985 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -1,83 +1,80 @@ #!/bin/bash . ./cmd.sh - . ./path.sh + stage=0 +# the location of the LDC corpus; this location works for the CLSP grid. +datadir=/export/corpora5/LDC/LDC2006S37 + +# The corpus and lexicon are on openslr.org +#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz" + +# Location of the Movie subtitles text corpus +subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" + . utils/parse_options.sh set -e set -o pipefail set -u -# the location of the LDC corpus; this location works for the CLSP grid. -datadir=/export/corpora5/LDC/LDC2006S37 -#datadir=/mnt/corpora/LDC2006S37 - -# location of subtitles text data -# note: this is not used so I'm commenting it out; dan. -#subsdata="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/en-es.txt.zip" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" # don't change tmpdir, the location is used explicitly in scripts in local/. tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # prepare the lists for acoustic model training and testing - mkdir -p $tmpdir/heroico - mkdir -p $tmpdir/usma - - [ ! -d "$datadir" ] && \ - echo "$0 Data directory (LDC corpus release) does not exist" && \ + if [ ! -d $datadir ]; then + echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz" + echo " and set $datadir to the directory where it is located." 
exit 1 - local/prepare_data.sh $datadir + fi + if [ ! -s santiago.txt ]; then + echo "$0: downloading the lexicon" + wget -c http://www.openslr.org/resources/34/santiago.tar.gz + tar -xvzf santiago.tar.gz + fi + # Get data for lm training + local/subs_download.sh $subtitles_url fi if [ $stage -le 1 ]; then - # prepare a dictionary - mkdir -p data/local/dict - mkdir -p data/local/tmp/dict - - # download the dictionary from openslr - if [ ! -f data/local/tmp/dict/santiago.tar.gz ]; then - wget -O data/local/tmp/dict/santiago.tar.gz $lexicon - fi - - ( - cd $tmpdir/dict - tar -xzf santiago.tar.gz - ) + echo "Making lists for building models." + local/prepare_data.sh $datadir +fi +if [ $stage -le 2 ]; then + mkdir -p data/local/dict $tmpdir/dict local/prepare_dict.sh +fi - # prepare the lang directory +if [ $stage -le 3 ]; then utils/prepare_lang.sh \ data/local/dict "" \ data/local/lang data/lang fi -if [ $stage -le 2 ]; then - # use am training text to train lm - mkdir -p $tmpdir/heroico/lm +if [ $stage -le 4 ]; then + mkdir -p $tmpdir/subs/lm + local/subs_prepare_data.pl +fi + +if [ $stage -le 5 ]; then echo "point 1" - # get the text from data/train/text - cut -d " " -f 2- data/train/text > $tmpdir/heroico/lm/train.txt - echo "point 2" - # build lm - local/prepare_lm.sh $tmpdir/heroico/lm/train.txt + local/prepare_lm.sh $tmpdir/subs/lm/in_vocabulary.txt +fi - echo "point 3" +if [ $stage -le 6 ]; then + echo "point 2" utils/format_lm.sh \ data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \ data/lang_test - - # delete temporary work - rm -rf data/local/tmp fi -if [ $stage -le 3 ]; then - # extract acoustic features +if [ $stage -le 7 ]; then + echo "$0: extracting acoustic features." mkdir -p exp for fld in native nonnative test devtest train; do @@ -92,7 +89,7 @@ if [ $stage -le 3 ]; then done fi -if [ $stage -le 4 ]; then +if [ $stage -le 8 ]; then echo "$0 monophone training" steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; @@ -108,8 +105,7 @@ if [ $stage -le 4 ]; then ) & fi -if [ $stage -le 5 ]; then - +if [ $stage -le 9 ]; then # align with monophones steps/align_si.sh --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali @@ -131,10 +127,8 @@ if [ $stage -le 5 ]; then fi -if [ $stage -le 6 ]; then +if [ $stage -le 10 ]; then echo "$0: Starting delta system alignment" - - # align with triphones steps/align_si.sh \ --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali @@ -156,10 +150,9 @@ if [ $stage -le 6 ]; then ) & fi -if [ $stage -le 7 ]; then +if [ $stage -le 11 ]; then echo "$0: Starting LDA+MLLT system alignment" - # align with lda and mllt adapted triphones steps/align_si.sh \ --use-graphs true --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_ali @@ -169,7 +162,6 @@ if [ $stage -le 7 ]; then --cmd "$train_cmd" \ 3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b - # align with tri3b models echo "$0 Starting exp/tri3b_ali" steps/align_fmllr.sh \ --nj 8 --cmd "$train_cmd" \ @@ -182,16 +174,16 @@ if [ $stage -le 7 ]; then utils/mkgraph.sh \ data/lang_test exp/tri3b exp/tri3b/graph || exit 1; - # decode test sets with tri3b models for x in native nonnative devtest test; do + echo "$0: decoding $x with tri3b models." steps/decode_fmllr.sh \ --nj 8 --cmd "$decode_cmd" exp/tri3b/graph data/$x exp/tri3b/decode_${x} done ) & fi -if [ $stage -le 9 ]; then - # train and test chain models +if [ $stage -le 12 ]; then + echo "$0: train and test chain models." 
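+  # local/chain/run_tdnn.sh is expected to be a symlink into local/chain/tuning/
+  # (e.g. run_tdnn_1b.sh); those tuning scripts assume the tri3b system and
+  # alignments produced by the stages above.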
local/chain/run_tdnn.sh fi diff --git a/egs/hkust/s5/RESULTS b/egs/hkust/s5/RESULTS index c419c9f6ddd..aac01fcb5af 100644 --- a/egs/hkust/s5/RESULTS +++ b/egs/hkust/s5/RESULTS @@ -1,3 +1,5 @@ +## Caution: these WERs are actually CERs. + # for x in exp/*/decode; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done %WER 80.67 [ 45198 / 56027, 1607 ins, 10733 del, 32858 sub ] exp/mono0a/decode/cer_9_0.0 %WER 58.79 [ 32939 / 56027, 2662 ins, 6124 del, 24153 sub ] exp/tri1/decode/cer_13_0.0 @@ -41,3 +43,6 @@ exp/nnet2_convnet/decode/cer_10:%WER 41.19 [ 23129 / 56154, 2599 ins, 3782 del, # nnet3 mfcc results (using speed perturbed data) exp/nnet3/tdnn_sp/decode_dev/cer_10:%WER 33.79 [ 18977 / 56154, 2027 ins, 3485 del, 13465 sub ] exp/nnet3/lstm_sp_ld5/decode_dev/cer_9:%WER 33.51 [ 18815 / 56154, 1813 ins, 3249 del, 13753 sub ] + + +# For nnet3+chain results, which are significantly better, see scripts in local/chain/tuning/. diff --git a/egs/hkust/s5/local/chain/compare_wer.sh b/egs/hkust/s5/local/chain/compare_wer.sh index b3376871a69..27a6b783433 100755 --- a/egs/hkust/s5/local/chain/compare_wer.sh +++ b/egs/hkust/s5/local/chain/compare_wer.sh @@ -39,25 +39,25 @@ for x in $*; do done echo -# print decode WER results -echo -n "# WER(%) " +# print decode CER results +echo -n "# CER(%) " for x in $*; do set_names $x - wer=$([ -d $x ] && grep WER $x/decode/cer_* | utils/best_wer.sh | awk '{print $2}') + wer=$([ -d $x ] && grep CER $x/decode/cer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo -# so how about online WER? +# so how about online CER? if $include_online; then - echo -n "# WER(%)[online] " + echo -n "# CER(%)[online] " for x in $*; do set_names $x wer=$(cat ${x}_online/decode/cer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo - echo -n "# WER(%)[per-utt] " + echo -n "# CER(%)[per-utt] " for x in $*; do set_names $x wer_per_utt=$(cat ${x}_online/decode_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') diff --git a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh old mode 100644 new mode 100755 index 0fc0de36a45..c62b776de2b --- a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh +++ b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh @@ -5,9 +5,9 @@ # Results # local/chain/compare_wer.sh --online exp/chain/tdnn_7h_chain_2b_sp # Model tdnn_7h_chain_2b_sp -# WER(%) 23.67 -# WER(%)[online] 23.69 -# WER(%)[per-utt] 24.67 +# CER(%) 23.67 +# CER(%)[online] 23.69 +# CER(%)[per-utt] 24.67 # Final train prob -0.0895 # Final valid prob -0.1251 # Final train prob (xent) -1.3628 @@ -109,7 +109,7 @@ if [ $stage -le 12 ]; then ivector_dim=$(feat-to-dim scp:exp/nnet3/ivectors_${train_set}/ivector_online.scp -) feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" output_opts="l2-regularize=0.002" diff --git a/egs/hkust/s5/local/create_oov_char_lexicon.pl b/egs/hkust/s5/local/create_oov_char_lexicon.pl index 0c146c9a123..33e2e8061c3 100755 --- a/egs/hkust/s5/local/create_oov_char_lexicon.pl +++ b/egs/hkust/s5/local/create_oov_char_lexicon.pl @@ -25,15 +25,17 @@ exit; } -use encoding utf8; +use utf8; my %prons; open(DICT, 
$ARGV[0]) || die("Can't open dict ".$ARGV[0]."\n"); +binmode(DICT,":encoding(utf8)"); foreach () { chomp; @A = split(" ", $_); $prons{$A[0]} = $A[1]; } close DICT; open(WORDS, $ARGV[1]) || die("Can't open oov word list ".$ARGV[1]."\n"); +binmode(WORDS,":encoding(utf8)"); while () { chomp; print $_; diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh index 207f03af36b..6342ccfe861 100755 --- a/egs/hkust/s5/local/hkust_data_prep.sh +++ b/egs/hkust/s5/local/hkust_data_prep.sh @@ -1,5 +1,5 @@ #!/bin/bash - + . ./path.sh || exit 1; if [ $# != 2 ]; then @@ -14,6 +14,11 @@ hkust_text_dir=$2 train_dir=data/local/train dev_dir=data/local/dev +# transcripts normalization and segmentation +# needs external tools +python2 -c "import mmseg" 2>/dev/null || { + echo "Python module mmseg is not found. To install it, run tools/extra/install_mmseg.sh"; exit 1; } + mkdir -p $train_dir mkdir -p $dev_dir @@ -35,7 +40,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` #collect all trans, convert encodings to utf-8, find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ - iconv -f GBK -t utf-8 - | perl -e ' + iconv -f GBK -t UTF-8 | perl -e ' while () { @A = split(" ", $_); if (@A <= 1) { next; } @@ -50,7 +55,7 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ ' | sort -k1 > $train_dir/transcripts.txt || exit 1; find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ - iconv -f GBK -t utf-8 - | perl -e ' + iconv -f GBK -t UTF-8 | perl -e ' while () { @A = split(" ", $_); if (@A <= 1) { next; } @@ -65,17 +70,13 @@ find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ ' | sort -k1 > $dev_dir/transcripts.txt || exit 1; #transcripts normalization and segmentation -#(this needs external tools), -python -c "import mmseg" 2>/dev/null || \ - (echo "mmseg is not found. Checkout tools/extra/install_mmseg.sh" && exit 1;) - cat $train_dir/transcripts.txt |\ sed -e 's// /g' |\ sed -e 's/<\/foreign>/ /g' |\ sed -e 's/\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ - python local/hkust_segment.py |\ + local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $train_dir/text || exit 1; cat $dev_dir/transcripts.txt |\ @@ -84,7 +85,7 @@ cat $dev_dir/transcripts.txt |\ sed -e 's/\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ - python local/hkust_segment.py |\ + local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $dev_dir/text || exit 1; # some data is corrupted. 
Delete them diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index 27d1060e945..49f27f2f868 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -176,7 +176,9 @@ wc -l $dict_dir/lexicon-ch/lexicon-ch-iv.txt # dictionary in order to get OOV pronunciations cat $dict_dir/cedict/ch-dict.txt |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); $word_len = length($A[0]); @@ -188,7 +190,9 @@ cat $dict_dir/cedict/ch-dict.txt |\ # extract chars cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\ perl -e ' - use encoding utf8; + use utf8; + binmode(STDIN,":encoding(utf8)"); + binmode(STDOUT,":encoding(utf8)"); while () { @A = split(" ", $_); @chars = split("", $A[0]); diff --git a/egs/hkust/s5/local/hkust_segment.py b/egs/hkust/s5/local/hkust_segment.py index 92d3add0e3e..d4c2b35a668 100755 --- a/egs/hkust/s5/local/hkust_segment.py +++ b/egs/hkust/s5/local/hkust_segment.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2.7 #coding:utf-8 from __future__ import print_function diff --git a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py index be0c7ad8e0d..5675dc3fbd9 100755 --- a/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py +++ b/egs/hub4_english/s5/local/data_prep/process_1995_bn_annotation.py @@ -31,9 +31,9 @@ def get_args(): parser = argparse.ArgumentParser("Process 1995 CSR-IV HUB4 transcripts") - parser.add_argument("--noise-word", type=str, default="", + parser.add_argument("--noise-word", default="", help="Word to add in-place of noise words") - parser.add_argument("--spoken-noise-word", type=str, + parser.add_argument("--spoken-noise-word", default="", help="Word to add in-place of speaker noise words") parser.add_argument("in_file", type=argparse.FileType('r'), @@ -230,7 +230,7 @@ def run(args): start_time = story_end_time segments = process_story_content( args, reco_id, - ' '.join([unicode(x) for x in s.children]), + ' '.join([str(x) for x in s.children]), start_time=story_begin_time, end_time=story_end_time) write_segments(segments, args) elif (s.name is not None and s.name != "language" @@ -240,9 +240,9 @@ def run(args): "or or ; got {0}".format(s)) elif s.name == "language" or s.name == "sung": non_story_contents.append( - ' '.join([unicode(x) for x in s.children])) + ' '.join([str(x) for x in s.children])) else: - non_story_contents.append(unicode(s)) + non_story_contents.append(str(s)) except RuntimeError: raise except Exception: diff --git a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py index 95aa7ddb831..fb5ba7a64ee 100755 --- a/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py +++ b/egs/hub4_english/s5/local/data_prep/process_1996_csr_hub4_lm_filelist.py @@ -36,9 +36,9 @@ def get_args(): corpus (LDC98T31).""") parser.add_argument("--verbose", choices=[0,1,2,3], type=int, default=0, help="Set higher for more verbose logging.") - parser.add_argument("file_list", type=str, + parser.add_argument("file_list", help="""List of compressed source files""") - parser.add_argument("dir", type=str, + parser.add_argument("dir", help="Output directory to dump processed files to") args = parser.parse_args() @@ -83,7 +83,7 @@ def process_file_lines(lines, 
out_file_handle): for x in para.contents: try: if x.name is None: - normalized_text = normalize_text(unicode(x)) + normalized_text = normalize_text(str(x)) if len(normalized_text) == 0: continue out_file_handle.write("{0}\n".format( diff --git a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py index 94b02a766a9..08203f7ada1 100755 --- a/egs/hub4_english/s5/local/data_prep/process_na_news_text.py +++ b/egs/hub4_english/s5/local/data_prep/process_na_news_text.py @@ -38,10 +38,10 @@ def get_args(): parser = argparse.ArgumentParser("Prepare NA News Text corpus (LDC95T21).") parser.add_argument("--verbose", type=int, choices=[0, 1, 2, 3], default=0, help="Use larger verbosity for more verbose logging.") - parser.add_argument("file_list", type=str, + parser.add_argument("file_list", help="List of compressed source files for NA News Text. " "e.g: /export/corpora/LDC/LDC95T21/na_news_1/latwp/1994") - parser.add_argument("out_file", type=str, + parser.add_argument("out_file", help="Output file to write to.") args = parser.parse_args() @@ -85,7 +85,7 @@ def process_file_lines(lines, out_file_handle): continue for para in art.find_all('p'): assert para.name == 'p' - text = ' '.join([unicode(x).strip() for x in para.contents]) + text = ' '.join([str(x).strip() for x in para.contents]) normalized_text = normalize_text(text) out_file_handle.write("{0}\n".format( normalized_text.encode('ascii'))) diff --git a/egs/hub4_english/s5/local/lm/merge_word_counts.py b/egs/hub4_english/s5/local/lm/merge_word_counts.py index 6338cbbf875..85e15d8dc07 100755 --- a/egs/hub4_english/s5/local/lm/merge_word_counts.py +++ b/egs/hub4_english/s5/local/lm/merge_word_counts.py @@ -7,6 +7,7 @@ A min-count argument is required to only write counts that are above the specified minimum count. """ +from __future__ import print_function import sys @@ -21,7 +22,7 @@ def main(): parts = line.strip().split() words[parts[1]] = words.get(parts[1], 0) + int(parts[0]) - for word, count in words.iteritems(): + for word, count in words.items(): if count >= int(sys.argv[1]): print ("{0} {1}".format(count, word)) diff --git a/egs/hub4_spanish/s5/local/chain/compare_wer.sh b/egs/hub4_spanish/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..0194b86ac69 --- /dev/null +++ b/egs/hub4_spanish/s5/local/chain/compare_wer.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("#WER test ") + +for n in 0; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(test) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/hub4_spanish/s5/local/chain/run_cnn_tdnn.sh b/egs/hub4_spanish/s5/local/chain/run_cnn_tdnn.sh new file mode 120000 index 00000000000..ab83f3c43e8 --- /dev/null +++ b/egs/hub4_spanish/s5/local/chain/run_cnn_tdnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_tdnn_1a.sh \ No newline at end of file diff --git a/egs/hub4_spanish/s5/local/chain/run_tdnn.sh b/egs/hub4_spanish/s5/local/chain/run_tdnn.sh index 211957092f9..61f8f499182 120000 --- a/egs/hub4_spanish/s5/local/chain/run_tdnn.sh +++ b/egs/hub4_spanish/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -./tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..d1b657a2d74 --- /dev/null +++ b/egs/hub4_spanish/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,287 @@ +#!/bin/bash + +## This is taken from mini_librispeech. 
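For reference, a usage sketch for the comparison script added above (the first command matches the results header just below; the sMBR directory in the second command is hypothetical — any chain experiment directories containing decode_* subdirectories will do):

# compare two chain systems, also printing the online-decoding WERs
local/chain/compare_wer.sh --online exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp
# for a discriminatively trained system, pick the epoch after a colon
local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1a_sp_smbr:{1,2,3}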
+ +# local/chain/compare_wer.sh --online exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp cnn_tdnn1a_sp +#WER test 14.19 13.47 +# [online:] 14.26 13.57 +# Final train prob -0.0707 -0.0911 +# Final valid prob -0.1225 -0.1145 +# Final train prob (xent) -1.1117 -1.3038 +# Final valid prob (xent) -1.3199 -1.3374 +# Num-params 6945216 4471200 + +# steps/info/chain_dir_info.pl exp/chain/cnn_tdnn1a_sp +# exp/chain/cnn_tdnn1a_sp: num-iters=102 nj=2..5 num-params=4.5M dim=40+100->2272 combine=-0.101->-0.097 (over 5) xent:train/valid[67,101,final]=(-1.46,-1.31,-1.30/-1.47,-1.34,-1.34) logprob:train/valid[67,101,final]=(-0.112,-0.097,-0.091/-0.129,-0.121,-0.114) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets=eval +gmm=tri5 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). 
+ tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/hub4_spanish-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/langp_test \ + $tree_dir $dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + nspk=$(wc -l $dir/configs/network.xconfig @@ -179,7 +179,7 @@ fi if [ $stage -le 14 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/hub4_spanish-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi steps/nnet3/chain/train.py --stage=$train_stage \ @@ -227,6 +227,16 @@ if [ $stage -le 15 ]; then $tree_dir $dir/graph || exit 1; fi +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + nspk=$(wc -l 2272 combine=-0.105->-0.100 (over 6) xent:train/valid[67,101,final]=(-1.54,-1.34,-1.35/-1.56,-1.39,-1.39) logprob:train/valid[67,101,final]=(-0.116,-0.099,-0.094/-0.135,-0.123,-0.116) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets=eval +gmm=tri5 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/hub4_spanish-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --egs.cmd="run.pl --max-jobs-run 12" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/langp_test \ + $tree_dir $dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + nspk=$(wc -l is written as a word if(w[0].lower() == ""): - f.write("%s\t\n" % (unicode(w[0]))) + f.write("%s\t\n" % (str(w[0]))) else: - f.write("%s\t%s\n" % (unicode(w[0]), + f.write("%s\t%s\n" % (str(w[0]), encoded_transcription[idx])) if __name__ == "__main__": diff --git a/egs/hub4_spanish/s5/local/prepare_unicode_dict.py b/egs/hub4_spanish/s5/local/prepare_unicode_dict.py index 86fa4d60ba1..3b9dc1abd86 100755 --- a/egs/hub4_spanish/s5/local/prepare_unicode_dict.py +++ b/egs/hub4_spanish/s5/local/prepare_unicode_dict.py @@ -89,7 +89,7 @@ def extract_phonemes(lexicon): # Read all baseform units into dictionary with {a: [a, a_1, a_2], # b: [b_1, b_3], ...} phonemes_dict = {} - for word, pron in lexicon.iteritems(): + for word, pron in lexicon.items(): for p in pron.split(): try: base = p.split("_",1)[0] @@ -98,11 +98,11 @@ def extract_phonemes(lexicon): phonemes_dict[base] = [p] # Makes sure there are no repeats in the list - phonemes_dict = {k: set(v) for k, v in phonemes_dict.iteritems()} + phonemes_dict = {k: set(v) for k, v in phonemes_dict.items()} # Get all unique phonemes phonemes = [] - for v in phonemes_dict.itervalues(): + for v in phonemes_dict.values(): for p in v: phonemes.append(p) @@ -137,11 +137,11 @@ def write_extra_questions(nonsil_phonemes, nonsil_phonemes_dict, # Write all possible phone_tag combinations that occur in the lexicon for tag in tags: - for p in nonsil_phonemes_dict.iterkeys(): + for p in nonsil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in nonsil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) - 
for p in sil_phonemes_dict.iterkeys(): + for p in sil_phonemes_dict.keys(): tagged_phoneme = "_".join([p, tag]) if(tagged_phoneme in sil_phonemes_dict[p]): fp.write("%s " % tagged_phoneme) diff --git a/egs/iam/v1/RESULTS b/egs/iam/v1/RESULTS new file mode 100644 index 00000000000..b25cb3cd772 --- /dev/null +++ b/egs/iam/v1/RESULTS @@ -0,0 +1,42 @@ +Run_end2end.sh (WER using lang_test, lang_unk) +flat_start: + • %WER 14.41 [ 2671 / 18542, 262 ins, 561 del, 1848 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + • %WER 15.21 [ 2821 / 18542, 375 ins, 500 del, 1946 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0 + +cnn_e2eali_1a: + • %WER 11.94 [ 2214 / 18542, 267 ins, 380 del, 1567 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_1.0 + • %WER 13.30 [ 2467 / 18542, 441 ins, 330 del, 1696 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5 + +cnn_e2eali_1b: + • %WER 11.20 [ 2076 / 18542, 260 ins, 335 del, 1481 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + • %WER 12.46 [ 2311 / 18542, 371 ins, 326 del, 1614 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0 + +cnn_e2eali_1c: + • %WER 9.90 [ 1836 / 18542, 257 ins, 227 del, 1352 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_10_1.0 + • %WER 12.10 [ 2243 / 18542, 411 ins, 269 del, 1563 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_12_0.5 + + +Run.sh (WER using lang_test, lang_unk) +cnn_1a: + • %WER 15.18 [ 2815 / 18542, 285 ins, 509 del, 2021 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + • %WER 16.88 [ 3130 / 18542, 444 ins, 611 del, 2075 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +cnn_chainali_1a: + • %WER 14.09 [ 2612 / 18542, 245 ins, 505 del, 1862 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_13_0.0 + • %WER 15.93 [ 2954 / 18542, 454 ins, 470 del, 2030 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_0.0 + +cnn_chainali_1b: + • %WER 13.29 [ 2465 / 18542, 221 ins, 499 del, 1745 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.5 + • %WER 15.09 [ 2798 / 18542, 418 ins, 468 del, 1912 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_10_0.5 + +cnn_chainali_1c: + • %WER 11.59 [ 2149 / 18542, 276 ins, 362 del, 1511 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 13.75 [ 2550 / 18542, 465 ins, 368 del, 1717 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1d: + • %WER 11.07 [ 2053 / 18542, 261 ins, 311 del, 1481 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0 + • %WER 12.95 [ 2402 / 18542, 436 ins, 313 del, 1653 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0 + +cnn_chainali_1e: + • %WER 10.03 [ 1859 / 18542, 226 ins, 291 del, 1342 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_11_0.5 + %WER 12.15 [ 2253 / 18542, 406 ins, 282 del, 1565 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_10_0.5 diff --git a/egs/iam/v1/local/augment_data.sh b/egs/iam/v1/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v1/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh index ad90710b13f..4a2cc29481c 100755 --- a/egs/iam/v1/local/chain/compare_wer.sh +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -34,6 +34,20 @@ for x in $*; do done echo +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi diff --git a/egs/iam/v1/local/chain/run_cnn.sh b/egs/iam/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_chainali.sh b/egs/iam/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..41b712609c2 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1d.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali.sh b/egs/iam/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_e2e_cnn.sh b/egs/iam/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/iam/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh similarity index 80% rename from egs/iam/v1/local/chain/run_cnn_1a.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_1a.sh index 41a76920e37..ef1273f3961 100755 --- a/egs/iam/v1/local/chain/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_1a.sh @@ -4,23 +4,23 @@ # 2017 Chun Chieh Chang # 2017 Ashish Arora -# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) - # local/chain/compare_wer.sh exp/chain/cnn_1a/ -# System cnn_1a -# WER 18.52 -# CER 10.07 -# Final train prob -0.0077 -# Final valid prob -0.0970 -# Final train prob (xent) -0.5484 -# Final valid prob (xent) -0.9643 -# Parameters 4.36M +# System cnn_1a(dict_50k) cnn_1a(dict_50k + unk 
model) +# WER 16.88 15.18 +# CER 8.52 7.58 +# WER val 16.17 13.53 +# CER val 7.15 5.89 +# Final train prob -0.0299 +# Final valid prob -0.0574 +# Final train prob (xent) -0.3912 +# Final valid prob (xent) -0.6439 +# Parameters 4.36M -set -e -o pipefail +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->368 combine=-0.029->-0.029 (over 2) xent:train/valid[27,41,final]=(-0.522,-0.394,-0.391/-0.695,-0.644,-0.644) logprob:train/valid[27,41,final]=(-0.035,-0.030,-0.030/-0.056,-0.057,-0.057) +set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it @@ -34,28 +34,21 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <368 combine=-0.020->-0.020 (over 2) xent:train/valid[27,41,final]=(-0.534,-0.425,-0.424/-0.659,-0.612,-0.612) logprob:train/valid[27,41,final]=(-0.026,-0.022,-0.022/-0.017,-0.016,-0.016) +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=false \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 79% rename from egs/iam/v1/local/chain/run_cnn_chainali_1b.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh index c6876fbafcb..401ffa14e19 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -1,27 +1,26 @@ #!/bin/bash # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
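The two columns in the refreshed result headers below reflect the two decoding setups used across these IAM recipes: a graph built from the plain 50k-word dictionary (presumably data/lang_test) versus one that also carries the unk model (data/lang_unk, the script's default lang_decode). Since the decode lang is now an ordinary variable parsed by utils/parse_options.sh, either column should be reproducible without editing the script; a hedged sketch, assuming training has already finished so only the graph and decode stages are rerun:

# decode with the 50k dictionary graph only
local/chain/tuning/run_cnn_chainali_1b.sh --stage 6 --lang-decode lang_test
# default: 50k dictionary plus the unk model
local/chain/tuning/run_cnn_chainali_1b.sh --stage 6 --lang-decode lang_unk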
- -# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ -# System cnn_1a cnn_chainali_1b -# WER 18.52 14.38 -# CER 10.07 7.14 -# Final train prob -0.0077 -0.0113 -# Final valid prob -0.0970 -0.0400 -# Final train prob (xent) -0.5484 -0.6043 -# Final valid prob (xent) -0.9643 -0.9030 -# Parameters 4.36M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b +# System cnn_chainali_1b(dict_50k) cnn_chainali_1b(dict_50k + unk_model) +# WER 15.09 13.29 +# CER 7.13 6.08 +# WER val 14.80 11.98 +# CER val 6.16 4.87 +# Final train prob -0.0225 +# Final valid prob -0.0132 +# Final train prob (xent) -0.4466 +# Final valid prob (xent) -0.6048 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ -# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) - +# exp/chain/cnn_chainali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.019->-0.019 (over 2) xent:train/valid[27,41,final]=(-0.545,-0.448,-0.447/-0.645,-0.605,-0.605) logprob:train/valid[27,41,final]=(-0.026,-0.023,-0.023/-0.014,-0.013,-0.013) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -31,31 +30,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -160,7 +145,6 @@ if [ $stage -le 4 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' mod?els... 
this @@ -191,9 +175,9 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -203,15 +187,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -227,20 +206,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. - utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh similarity index 80% rename from egs/iam/v1/local/chain/run_cnn_chainali_1c.sh rename to egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh index 54c52d913de..17209b9204f 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/tuning/run_cnn_chainali_1c.sh @@ -1,25 +1,25 @@ #!/bin/bash # chainali_1c is as chainali_1b except it uses l2-regularize -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c -# System cnn_chainali_1b cnn_chainali_1c -# WER 14.38 12.72 -# CER 7.14 5.99 -# Final train prob -0.0113 -0.0291 -# Final valid prob -0.0400 -0.0359 -# Final train prob (xent) -0.6043 -0.9781 -# Final valid prob (xent) -0.9030 -1.1544 -# Parameters 3.96M 3.96M +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1c +# System cnn_chainali_1c (dict_50k) cnn_chainali_1c(dict_50k + unk_model) +# WER 12.95 11.07 +# CER 6.04 4.91 +# WER val 12.75 9.78 +# CER val 5.15 3.74 +# Final train prob -0.0217 +# Final valid prob -0.0060 +# Final train prob (xent) -0.8303 +# Final valid prob (xent) -0.8665 +# Parameters 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c -# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) - +# exp/chain/cnn_chainali_1c/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail - stage=0 - nj=30 train_set=train +decode_val=true gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. @@ -29,30 +29,20 @@ chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= -# chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat <376 combine=-0.002->-0.002 (over 1) xent:train/valid[13,20,final]=(-1.66,-1.01,-0.865/-1.72,-1.12,-1.01) logprob:train/valid[13,20,final]=(-0.058,-0.019,-0.004/-0.055,-0.027,-0.013) - +# exp/chain/cnn_chainali_1d/: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.018->-0.018 (over 1) xent:train/valid[27,41,final]=(-1.22,-0.847,-0.830/-1.19,-0.880,-0.867) logprob:train/valid[27,41,final]=(-0.045,-0.025,-0.022/-0.026,-0.010,-0.006) set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1c_uc #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a_uc +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_unk +lang_decode=lang_unk +decode_val=true +if $decode_val; then maybe_val=val; else maybe_val= ; fi + # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -157,7 +147,6 @@ if [ $stage -le 4 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -192,11 +181,11 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ - --trainer.srand=$srand \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ --trainer.frames-per-iter=1000000 \ @@ -206,15 +195,10 @@ if [ $stage -le 5 ]; then --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -230,20 +214,20 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/$lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..703d404159a --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a +# System cnn_e2eali_1a_(dict_50k) cnn_e2eali_1a_(dict_50k + unk model) +# WER 13.30 11.94 +# CER 5.95 5.15 +# WER val 12.85 10.71 +# CER val 5.09 4.03 +# Final train prob -0.0562 +# Final valid prob -0.0634 +# Final train prob (xent) -0.8196 +# Final valid prob (xent) -0.8816 +# Parameters 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.058->-0.058 (over 1) xent:train/valid[27,41,final]=(-2.67,-0.841,-0.820/-2.71,-0.892,-0.882) logprob:train/valid[27,41,final]=(-0.240,-0.060,-0.056/-0.245,-0.068,-0.063) + +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +remove_egs=true +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..905c4661477 --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,221 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1b (dict_50k) cnn_e2eali_1b (dict_50k + unk model) +# WER 12.46 11.20 +# CER 5.53 4.76 +# WER val 12.71 10.49 +# CER val 4.97 3.92 +# Final train prob -0.0381 +# Final valid prob -0.0443 +# Final train prob (xent) -0.7860 +# Final valid prob (xent) -0.8290 +# Parameters 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=42 nj=2..4 num-params=4.0M dim=40->368 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-1.19,-0.805,-0.786/-1.19,-0.846,-0.829) logprob:train/valid[27,41,final]=(-0.060,-0.041,-0.038/-0.062,-0.048,-0.044) + +set -e -o pipefail +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +lang_decode=lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy 
objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=true \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..26b1aca0929 --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1c (dict_50k) cnn_e2eali_1c(dict_50k + unk_model) +# WER 12.10 9.90 +# CER 5.23 4.16 +# WER val 12.15 9.60 +# CER val 4.78 3.56 +# Final train prob -0.0470 +# Final valid prob -0.0657 +# Final train prob (xent) -0.4713 +# Final valid prob (xent) -0.5437 +# Parameters 4.32M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=30 nj=3..5 num-params=4.3M dim=40->368 combine=-0.051->-0.051 (over 1) xent:train/valid[19,29,final]=(-0.722,-0.500,-0.471/-0.748,-0.568,-0.544) logprob:train/valid[19,29,final]=(-0.090,-0.053,-0.047/-0.106,-0.071,-0.066) +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=550 +lang_decode=data/lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + $train_data_dir data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves $train_data_dir \ + $lang $ali_dir $tree_dir +fi + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=true \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..462ad0522de --- /dev/null +++ b/egs/iam/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a (dict_50k) e2e_cnn_1a (dict_50k + unk_model) +# WER 15.21 14.41 +# CER 7.43 6.82 +# WER val 14.84 13.51 +# CER val 6.41 5.60 +# Final train prob -0.0206 +# Final valid prob -0.0393 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=-0.020->-0.020 (over 1) logprob:train/valid[27,41,final]=(-0.025,-0.021,-0.021/-0.044,-0.040,-0.039) + +set -e +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang_unk +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
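+  # The minibatch_size string defined at the top of this script
+  # ("150=100,64/300=50,32/600=25,16/1200=16,8") appears to pair chunk lengths
+  # with allowed minibatch sizes, so that longer chunks are grouped into
+  # smaller minibatches.  A rough, illustrative way to read such a rule string
+  # (a sketch only, not how train_e2e.py parses it internally):
+  #   for rule in ${minibatch_size//\// }; do
+  #     echo "chunk length ~${rule%%=*} -> minibatch sizes ${rule#*=}"
+  #   done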
+ steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/$train_set \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + done +fi + +echo "$0 Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/extract_features.sh b/egs/iam/v1/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v1/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v1/local/gen_topo.py b/egs/iam/v1/local/gen_topo.py new file mode 100755 index 00000000000..6fae276d542 --- /dev/null +++ b/egs/iam/v1/local/gen_topo.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. 
This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split('_')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/iam/v1/local/make_features.py 
b/egs/iam/v1/local/make_features.py index 84e012daedb..3ce501732cf 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to @@ -14,20 +15,27 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys +import scipy.io as sio import numpy as np from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, - help='Source data directory (containing images.scp)') +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') parser.add_argument('--out-ark', type=str, default='-', help='Where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, @@ -35,8 +43,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -56,18 +66,12 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im, allowed_lengths = None): - scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) + +def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding else: # Find an allowed length for the image - imlen = im.shape[1] + imlen = im.shape[1] # width allowed_len = 0 for l in allowed_lengths: if l > imlen: @@ -77,28 +81,153 @@ def get_scaled_image(im, allowed_lengths = None): # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen - left_padding = padding // 2 + left_padding = int(padding // 2) right_padding = padding - left_padding - dim_y = im.shape[0] + dim_y = im.shape[0] # height im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir, 'images.scp') +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) 
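+    # At this point two target shapes have been computed: (nx, ny) rescales the
+    # image to the feature height (args.feat_dim, 40 by default) while keeping
+    # the aspect ratio, and (down_nx, down_ny) uses a random height between 10
+    # and 30 pixels.  In the else branch below (mode != 'normal') the image is
+    # resized down and then back up, which blurs it and acts as a simple
+    # low-resolution augmentation.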
+ if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = 
open(args.out_ark,'w') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] - with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + with open(allowed_len_handle) as f: for line in f: allowed_lengths.append(int(line.strip())) print("Read {} allowed lengths and will apply them to the " @@ -106,6 +235,7 @@ def get_scaled_image(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -113,15 +243,24 @@ def get_scaled_image(im, allowed_lengths = None): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scaled = get_scaled_image(im, allowed_lengths) - - if im_scaled is None: + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_scaled, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) -print('Generated features for {} images. Failed for {} (iamge too ' +print('Generated features for {} images. Failed for {} (image too ' 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 73d711c73f0..dc07f07e318 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +process_aachen_split=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -53,6 +54,8 @@ ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits mkdir -p $download_dir data/local # download and extact images and transcription @@ -144,6 +147,18 @@ else echo "$0: Wellington Corpus not included because wellington_dir not provided" fi +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." 
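+    # The Aachen splits (OpenSLR resource 56, see $aachen_split_url above) are
+    # an alternative train/validation/test partition of the IAM lines; when
+    # --process_aachen_split is true they are used instead of the default
+    # largeWriterIndependentTextLineRecognitionTask splits.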
+ mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask @@ -160,11 +175,17 @@ cat $train_old > $train_new cat $test_old > $test_new cat $val1_old $val2_old > $val_new -if [ $stage -le 0 ]; then - local/process_data.py data/local data/train --dataset train || exit 1 - local/process_data.py data/local data/test --dataset test || exit 1 - local/process_data.py data/local data/val --dataset validation || exit 1 - - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val + diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index f691d577fba..7451f6b85f7 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -38,7 +38,7 @@ while(<>){ }' | sort -u > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +perl -i -pe "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/local/process_aachen_splits.py b/egs/iam/v1/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v1/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index a15fbea2af3..3e8c838efdb 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -58,9 +58,12 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true # Using LOB and brown corpus. - cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text data/val/text \ - > ${dir}/data/text/lob.txt + if [ ! 
-f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt > ${dir}/data/text/wellington.txt diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py index c5ad1235427..1f1404b5165 100755 --- a/egs/iam/v1/local/unk_arc_post_to_transcription.py +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -1,88 +1,108 @@ #!/usr/bin/env python3 -# Copyright 2017 Ashish Arora +#Copyright 2017 Ashish Arora +""" This module will be used by scripts for open vocabulary setup. + If the hypothesis transcription contains , then it will replace the + with the word predicted by model by concatenating phones decoded + from the unk-model. It is currently supported only for triphone setup. + Args: + phones: File name of a file that contains the phones.txt, (symbol-table for phones). + phone and phoneID, Eg. a 217, phoneID of 'a' is 217. + words: File name of a file that contains the words.txt, (symbol-table for words). + word and wordID. Eg. ACCOUNTANCY 234, wordID of 'ACCOUNTANCY' is 234. + unk: ID of . Eg. 231. + one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior + of arcs along the one-best path from the lattice. + E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231 + [] + [ ...] + output-text: File containing hypothesis transcription with recognized by the + unk-model. + E.g. A move to stop mr. gaitskell. + + Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt + data/lang/oov.int +""" import argparse +import io +import os import sys - parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") -parser.add_argument('phones', type=str, help='phones and phonesID') -parser.add_argument('words', type=str, help='word and wordID') -parser.add_argument('unk', type=str, default='-', help='location of unk file') -parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +parser.add_argument('phones', type=str, help='File name of a file that contains the' + 'symbol-table for phones. Each line must be: ') +parser.add_argument('words', type=str, help='File name of a file that contains the' + 'symbol-table for words. Each line must be: ') +parser.add_argument('unk', type=str, default='-', help='File name of a file that' + 'contains the ID of . The content must be: , e.g. 
231') +parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post' + 'format, which is a list of timing info and posterior of arcs' + 'along the one-best path from the lattice') +parser.add_argument('--output-text', type=str, default='-', help='File containing' + 'hypothesis transcription with recognized by the unk-model') args = parser.parse_args() - ### main ### -phone_fh = open(args.phones, 'r', encoding='latin-1') -word_fh = open(args.words, 'r', encoding='latin-1') -unk_fh = open(args.unk, 'r', encoding='latin-1') -if args.input_ark == '-': - input_fh = sys.stdin +phone_handle = open(args.phones, 'r', encoding='utf8') # Create file handles +word_handle = open(args.words, 'r', encoding='utf8') +unk_handle = open(args.unk,'r', encoding='utf8') +if args.one_best_arc_post == '-': + arc_post_handle = io.TextIOWrapper(sys.stdin.buffer, encoding='utf8') else: - input_fh = open(args.input_ark, 'r', encoding='latin-1') -if args.out_ark == '-': - out_fh = sys.stdout + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='utf8') +if args.output_text == '-': + output_text_handle = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') else: - out_fh = open(args.out_ark, 'w', encoding='latin-1') + output_text_handle = open(args.output_text, 'w', encoding='utf8') -phone_dict = dict() # Stores phoneID and phone mapping -phone_data_vect = phone_fh.read().strip().split("\n") -for key_val in phone_data_vect: +id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) +phones_data = phone_handle.read().strip().split("\n") + +for key_val in phones_data: key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + id2phone[key_val[1]] = key_val[0] + word_dict = dict() -word_data_vect = word_fh.read().strip().split("\n") +word_data_vect = word_handle.read().strip().split("\n") + for key_val in word_data_vect: key_val = key_val.split(" ") word_dict[key_val[1]] = key_val[0] -unk_val = unk_fh.read().strip().split(" ")[0] +unk_val = unk_handle.read().strip().split(" ")[0] -utt_word_dict = dict() -utt_phone_dict = dict() # Stores utteranceID and phoneID -unk_word_dict = dict() -count=0 -for line in input_fh: +utt_word_dict = dict() # Dict of list, stores mapping from utteranceID(int) to words(str) +for line in arc_post_handle: line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print("Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + if len(line_vect) < 6: # Check for 1best-arc-post output + print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line), file=sys.stderr) continue - uttID = line_vect[0] + utt_id = line_vect[0] word = line_vect[4] phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # Get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) + if utt_id not in list(utt_word_dict.keys()): + utt_word_dict[utt_id] = list() + + if word == unk_val: # Get the 1best phone sequence given by the unk-model + phone_id_seq = phones.split(" ") + phone_seq = list() + for pkey in phone_id_seq: + phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence. 
phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word + for phone_val in phone_seq: + phone_2_word.append(phone_val.split('_')[0]) # Removing the world-position markers(e.g. _B) + phone_2_word = ''.join(phone_2_word) # Concatnate phone sequence + utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model else: - if word == '0': + if word == '0': # Store space/silence word_val = ' ' else: word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post -transcription = "" -for key in sorted(utt_word_dict.keys()): - transcription = key - for index in sorted(utt_word_dict[key].keys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') +transcription = "" # Output transcription +for utt_key in sorted(utt_word_dict.keys()): + transcription = utt_key + for word in utt_word_dict[utt_key]: + transcription = transcription + " " + word + output_text_handle.write(transcription + '\n') diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index b943870f530..85811b6cb3d 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -20,6 +20,9 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ +train_set=train_aug +process_aachen_split=false +overwrite=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -30,39 +33,63 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - echo "$0: Preparing the test and train feature files..." - for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. 
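+  # Roughly speaking (hypothetical numbers; the real list is derived from the
+  # observed image widths by image/get_allowed_lengths.py), each allowed length
+  # is about 10% longer than the previous one and stays divisible by the
+  # frame-subsampling factor of 4, e.g.:
+  #   len=60
+  #   while [ $len -le 100 ]; do
+  #     echo $(( len / 4 * 4 ))     # round down to a multiple of 4
+  #     len=$(echo "print (int($len * 1.1) + 1)" | python)
+  #   done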
+ image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done + utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict # this is for training utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang - + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM: local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k # this is for decoding @@ -77,11 +104,14 @@ if [ $stage -le 3 ]; then utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi if [ $stage -le 4 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/$train_set \ data/lang exp/mono fi @@ -93,10 +123,10 @@ if [ $stage -le 5 ] && $decode_gmm; then fi if [ $stage -le 6 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/mono exp/mono_ali - steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + steps/train_deltas.sh --cmd $cmd 500 20000 data/$train_set data/lang \ exp/mono_ali exp/tri fi @@ -108,12 +138,12 @@ if [ $stage -le 7 ] && $decode_gmm; then fi if [ $stage -le 8 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + steps/align_si.sh --nj $nj --cmd $cmd data/$train_set data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" 500 20000 \ - data/train data/lang exp/tri_ali exp/tri2 + data/$train_set data/lang exp/tri_ali exp/tri2 fi if [ $stage -le 9 ] && $decode_gmm; then @@ -125,10 +155,10 @@ fi if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri2 exp/tri2_ali + data/$train_set data/lang 
exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd 500 20000 \ - data/train data/lang exp/tri2_ali exp/tri3 + data/$train_set data/lang exp/tri2_ali exp/tri3 fi if [ $stage -le 11 ] && $decode_gmm; then @@ -140,13 +170,13 @@ fi if [ $stage -le 12 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri3 exp/tri3_ali + data/$train_set data/lang exp/tri3 exp/tri3_ali fi if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh --lang-test lang_unk + local/chain/run_cnn.sh --lang-test lang_unk --train_set $train_set fi if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1c.sh --chain-model-dir exp/chain/cnn_1a --stage 2 + local/chain/run_cnn_chainali.sh --chain-model-dir exp/chain/cnn_1a --stage 2 --train_set $train_set fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 6df93e739f4..0a8b014715f 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -6,6 +6,8 @@ stage=0 nj=20 username= password= +process_aachen_split=false +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -16,61 +18,78 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # This corpus is of written NZ English that can be purchased here: # "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ - +train_set=train_aug . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - - ./local/check_tools.sh - if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test,val}/data if [ $stage -le 1 ]; then - image/get_image2num_frames.py data/train # This will be needed for the next command + echo "$0: $(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$0: Preparing the test and train feature files..." 
- for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$0: $(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$0: $(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist local/train_lm.sh --vocab-size 50k fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." - # This is for training. Use a large vocab size, e.g. 500k to include all the # training words: local/prepare_dict.sh --vocab-size 500k --dir data/local/dict - utils/prepare_lang.sh --sil-prob 0.95 \ + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo # This is for decoding. We use a 50k lexicon to be consistent with the papers # reporting WERs on IAM. local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k - utils/prepare_lang.sh --sil-prob 0.95 data/local/dict_50k \ - "" data/lang_test/temp data/lang_test + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict_50k "" data/lang_test/temp data/lang_test utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \ data/local/dict_50k/lexicon.txt data/lang_test @@ -79,23 +98,27 @@ if [ $stage -le 3 ]; then data/local/dict_50k exp/unk_lang_model utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ data/local/dict_50k "" data/lang_unk/temp data/lang_unk + + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang_unk/phones.txt >data/lang_unk/topo cp data/lang_test/G.fst data/lang_unk/G.fst fi -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_flatstart_cnn1a.sh + local/chain/run_e2e_cnn.sh --train_set $train_set fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." 
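The augmentation in stage 2 above doubles the training data by copying the data directory under an utterance/speaker prefix, re-extracting the copy's features with --augment true, and combining the copy with the original. A rough sketch of the bookkeeping only; prefix_scp and combine are illustrative names, and the real work (including text, utt2spk/spk2utt and validation) is done by image/copy_data_dir.sh, local/extract_features.sh and utils/combine_data.sh.

def prefix_scp(in_scp, out_scp, prefix="aug1-"):
    # Give every copied utterance a new ID so it can live alongside the original.
    with open(in_scp) as fin, open(out_scp, "w") as fout:
        for line in fin:
            utt_id, rest = line.rstrip("\n").split(" ", 1)
            fout.write("{}{} {}\n".format(prefix, utt_id, rest))

def combine(out_path, *in_paths):
    # Concatenating the original list and the prefixed copy doubles the data.
    with open(out_path, "w") as fout:
        for path in in_paths:
            with open(path) as fin:
                fout.write(fin.read())

# e.g. prefix_scp("data/train/images.scp", "data/train_copy/images.scp")
#      combine("data/train_aug/images.scp",
#              "data/train/images.scp", "data/train_copy/images.scp")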
steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/$train_set data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - local/chain/run_cnn_e2eali_1a.sh + local/chain/run_cnn_e2eali.sh --train_set $train_set fi diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v2/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v2/image b/egs/iam/v2/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v2/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v2/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh new file mode 100755 index 00000000000..2ce14e13694 --- /dev/null +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. 
./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..da731bcb0b1 --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1d.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..7dca9c30e23 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1b.sh \ No newline at end of file diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh similarity index 91% rename from egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index ba28f681708..9a01688ba35 100755 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh +++ 
b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -22,6 +22,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -42,7 +43,9 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -132,7 +135,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" @@ -228,18 +231,26 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh similarity index 86% rename from egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index 6d8cca876bf..28aa246f334 100755 --- a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -2,15 +2,17 @@ # e2eali_1b is the same as e2eali_1a but uses unconstrained egs -# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b # System cnn_e2eali_1a cnn_e2eali_1b -# WER 12.79 12.23 -# CER 5.73 5.48 -# Final train prob -0.0556 -0.0367 -# Final valid prob -0.0795 -0.0592 -# Final train prob (xent) -0.9178 -0.8382 -# Final valid prob (xent) -1.0604 -0.9853 -# Parameters 3.95M 3.95M +# WER 10.40 10.33 +# WER (rescored) 10.02 10.10 +# CER 4.97 5.00 +# CER (rescored) 4.83 4.88 +# Final train prob -0.0612 -0.0428 +# Final valid prob -0.0857 -0.0666 +# Final train prob (xent) -0.8990 -0.9210 +# Final valid prob (xent) -1.0024 -1.0264 +# Parameters 3.98M 3.98M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b # exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) @@ -21,6 +23,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -41,7 +44,10 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -131,7 +137,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) cnn_opts="l2-regularize=0.075" tdnn_opts="l2-regularize=0.075" output_opts="l2-regularize=0.1" @@ -227,18 +233,26 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..f158317950a --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. + + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1b cnn_e2eali_1c +# WER 10.33 10.05 +# WER (rescored) 10.10 9.75 +# CER 5.00 4.76 +# CER (rescored) 4.88 4.68 +# Final train prob -0.0428 -0.0317 +# Final valid prob -0.0666 -0.0630 +# Final train prob (xent) -0.9210 -0.5413 +# Final valid prob (xent) -1.0264 -0.7096 +# Parameters 3.98M 5.12M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
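+  # (A lattice of alignments, rather than the single best path, lets the chain
+  #  numerator graph choose among alternative alignments within the configured
+  #  left/right tolerances instead of being pinned to the e2e model's 1-best
+  #  alignment.)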
+ # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
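+  # For example, with xent_regularize=0.1 the factor is 0.5/0.1 = 5.0, so the
+  # xent output layer trains with 5x the base learning rate, offsetting the
+  # 0.1 weight on its objective.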
+ relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh new file mode 100755 index 00000000000..1c44057454a --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# e2eali_1d is the same as e2eali_1c but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d +# System e2e_cnn_1b cnn_e2eali_1d +# WER 13.91 8.80 +# WER (rescored) 13.64 8.52 +# CER 7.08 4.06 +# CER (rescored) 6.82 3.98 +# Final train prob 0.0148 -0.0524 +# Final valid prob 0.0105 -0.0713 +# Final train prob (xent) -0.4695 +# Final valid prob (xent) -0.5310 +# Parameters 9.52M 4.36M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d +# exp/chain/cnn_e2eali_1d: num-iters=30 nj=3..5 num-params=4.4M dim=40->400 combine=-0.055->-0.055 (over 1) xent:train/valid[19,29,final]=(-0.683,-0.489,-0.469/-0.703,-0.544,-0.531) logprob:train/valid[19,29,final]=(-0.090,-0.057,-0.052/-0.107,-0.076,-0.071) +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1b +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh similarity index 84% rename from egs/iam/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh index 56c897137f4..cb2bfa0a82d 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -2,19 +2,21 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. 
from scratch) - -# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a -# System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.52 12.72 13.87 -# CER 10.07 5.99 6.54 -# Final train prob -0.0077 -0.0291 -0.0371 -# Final valid prob -0.0970 -0.0359 -0.0636 -# Final train prob (xent) -0.5484 -0.9781 -# Final valid prob (xent) -0.9643 -1.1544 -# Parameters 4.36M 3.96M 9.13M +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 11.24 +# WER (rescored) 10.80 +# CER 5.32 +# CER (rescored) 5.24 +# Final train prob 0.0568 +# Final valid prob 0.0381 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.13M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) + set -e @@ -23,6 +25,7 @@ stage=0 train_stage=-10 get_egs_stage=-10 affix=1a +nj=30 # training options tdnn_dim=450 @@ -35,7 +38,9 @@ l2_regularize=0.00005 frames_per_iter=1000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train -lang_test=lang_unk +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -95,7 +100,6 @@ if [ $stage -le 2 ]; then mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -106,7 +110,6 @@ if [ $stage -le 2 ]; then relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts @@ -155,15 +158,19 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). 
Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh new file mode 100755 index 00000000000..d5f79602695 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ +# System e2e_cnn_1b +# WER 13.59 +# WER (rescored) 13.27 +# CER 6.92 +# CER (rescored) 6.71 +# Final train prob 0.0345 +# Final valid prob 0.0269 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1b +# exp/chain/e2e_cnn_1b: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=0.041->0.041 (over 2) logprob:train/valid[27,41,final]=(0.032,0.035,0.035/0.025,0.026,0.027) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1b +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/check_tools.sh b/egs/iam/v2/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/iam/v2/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." 
+ exit 1 +fi + + +exit 0 diff --git a/egs/iam/v2/local/extract_features.sh b/egs/iam/v2/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v2/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py new file mode 100755 index 00000000000..8ffc59c5788 --- /dev/null +++ b/egs/iam/v2/local/gen_topo.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +from __future__ import division +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_nonsil_states)) +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(x, xp1)) +print(" {} ".format(args.num_punctuation_states)) +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = "{} {} {} ".format(state_str, x, transp)) + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " {0} " + print(state_str) + second_last = args.num_sil_states - 1 + print(" {0} {0} {0} 0.75 {1} 0.25 ".format(second_last, args.num_sil_states)) + print(" {} ".format(args.num_sil_states)) +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" {} ".format(args.num_sil_states)) +print("") +print("") diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py new file mode 100755 index 00000000000..3ce501732cf --- /dev/null +++ b/egs/iam/v2/local/make_features.py @@ -0,0 +1,266 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Yiwen Shao +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + eg. 
local/make_features.py data/train --feat-dim 40 +""" +import random +import argparse +import os +import sys +import scipy.io as sio +import numpy as np +from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + + +def horizontal_pad(im, allowed_lengths = None): + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] # width + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = int(padding // 2) + right_padding = padding - left_padding + dim_y = im.shape[0] # height + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = sorted_im[low_index] + 
high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'w') + +allowed_lengths = None +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(allowed_len_handle) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +aug_setting = ['normal', 'scaled'] +with open(data_list_path) as f: + for line in f: + line = line.strip() + 
line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: + num_fail += 1 + continue + data = np.transpose(im_horizontal_padded, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (image too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh new file mode 100755 index 00000000000..cf729d9a939 --- /dev/null +++ b/egs/iam/v2/local/prepare_data.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +process_aachen_split=false +wellington_dir= +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." + echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... 
and then call this script again with --username --password " + echo "" + exit 1 +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +wcorpus=data/local/wellingtoncorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits +mkdir -p $download_dir data/local + +# download and extact images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! -f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +if [ -d $wcorpus ]; then + echo "$0: Not copying Wellington corpus as it is already there." +elif [ ! -z $wellington_dir ]; then + mkdir -p $wcorpus + cp -r $wellington_dir/. $wcorpus + + # Combine Wellington corpora and replace some of their annotations + cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \ + cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt + + cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt + + echo "$0: Done copying Wellington corpus" +else + echo "$0: Wellington Corpus not included because wellington_dir not provided" +fi + +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." + mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 +fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh new file mode 100755 index 00000000000..714b5b51788 --- /dev/null +++ b/egs/iam/v2/local/prepare_dict.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +vocab_size=50000 +. ./utils/parse_options.sh + +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u | grep -v "|" > $dir/nonsilence_phones.txt + +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. 
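As an illustration of what the pipeline below produces (hypothetical entries; the real ones depend on the BPE wordlist), each surviving word is spelled out letter by letter, with '|' mapped to SIL:

# e.g. lexicon.txt lines of the form:
#   MOVE   M O V E
#   |the   SIL t h e
echo "MOVE" | perl -ne 'chomp; print "$_ ", join(" ", split(//)), "\n";'   # prints: MOVE M O V E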
+# (Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \ + perl -e '$letters=$ENV{letters}; $letters=$letters . "|"; +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); + $trans =~ s/#//g; + $trans =~ s/\|/SIL/g; + print "$w $trans\n"; + } +}' | sort -u > $dir/lexicon.txt + + +perl -i -pe "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v2/local/process_aachen_splits.py b/egs/iam/v2/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v2/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = 
os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py new file mode 100755 index 00000000000..2adae7bf7be --- /dev/null +++ b/egs/iam/v2/local/process_data.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..5e5dac52818 --- /dev/null +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -0,0 +1,142 @@ +#!/usr/bin/env 
python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + if char == '*': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + # avoiding very small utterance, it causes removing + # complete lob text + if len(line_to_find) < 10: + remaining_utterances[line_id] = line_to_find + else: + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + +# removing long utterances not found above +row_to_keep[87530] = False; row_to_keep[87531] = False; 
row_to_keep[87532] = False; +row_to_keep[31724] = False; row_to_keep[31725] = False; row_to_keep[31726] = False; +row_to_keep[16704] = False; row_to_keep[16705] = False; row_to_keep[16706] = False; +row_to_keep[94181] = False; row_to_keep[94182] = False; row_to_keep[94183] = False; +row_to_keep[20171] = False; row_to_keep[20172] = False; row_to_keep[20173] = False; +row_to_keep[16734] = False; row_to_keep[16733] = False; row_to_keep[16732] = False; +row_to_keep[20576] = False; row_to_keep[20577] = False; row_to_keep[20578] = False; +row_to_keep[31715] = False; row_to_keep[31716] = False; row_to_keep[31717] = False; +row_to_keep[31808] = False; row_to_keep[31809] = False; row_to_keep[31810] = False; +row_to_keep[31822] = False; row_to_keep[31823] = False; row_to_keep[31824] = False; +row_to_keep[88791] = False; row_to_keep[88792] = False; row_to_keep[88793] = False; +row_to_keep[31745] = False; row_to_keep[31746] = False; row_to_keep[31825] = False; +row_to_keep[94256] = False; row_to_keep[94257] = False; row_to_keep[88794] = False; +row_to_keep[88665] = False; row_to_keep[17093] = False; row_to_keep[17094] = False; +row_to_keep[20586] = False; row_to_keep[87228] = False; row_to_keep[87229] = False; +row_to_keep[16744] = False; row_to_keep[87905] = False; row_to_keep[87906] = False; +row_to_keep[16669] = False; row_to_keep[16670] = False; row_to_keep[16719] = False; +row_to_keep[87515] = False; row_to_keep[20090] = False; row_to_keep[31748] = False; +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. 
format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v2/local/remove_wellington_annotations.py b/egs/iam/v2/local/remove_wellington_annotations.py new file mode 100755 index 00000000000..260a3542985 --- /dev/null +++ b/egs/iam/v2/local/remove_wellington_annotations.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +import sys +import io +import re +from collections import OrderedDict + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); + +prev2_line = " "; +prev_line = " "; +for line in sys.stdin: + line = line.strip() + pattern = re.compile("\\*\\*\\[.*?\\*\\*\\]|\\*[0-9]|\\\\[0-9]{0,2}|\\*\\*?[\|,\?,\#,\=,\;,\:,\<,\>]|\||\^") + line_fixed = pattern.sub("", line) + dict=OrderedDict([("*+$","$"), ("*+","£"), ("*-","-"), ("*/","*"), ("*{","{"), ("*}","}"), + ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"), ("*@","°")]) + pattern = re.compile("|".join(re.escape(key) for key in dict.keys())); + line_fixed = pattern.sub(lambda x: dict[x.group()], line_fixed) + + line_fixed = prev2_line + "\n" + prev_line + "\n" + line_fixed + + pattern = re.compile("\{[0-9]{0,2}(.*?)\}", re.DOTALL) + line_fixed = pattern.sub(lambda x: x.group(1), line_fixed) + + output, prev2_line, prev_line = line_fixed.split("\n") + + sys.stdout.write(output + "\n") +sys.stdout.write(prev2_line + "\n") +sys.stdout.write(prev_line + "\n") diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/iam/v2/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh new file mode 100755 index 00000000000..cc0119eb748 --- /dev/null +++ b/egs/iam/v2/local/train_lm.sh @@ -0,0 +1,156 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +vocab_size=50000 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
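For reference, the bypass line uses pocolm's --bypass-metaparameter-optimization option; the numbers below are placeholders only (copy the real ones from the metaparameter line in your own train_lm.py log), which is why the line is left commented out:

# bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.144,0.599,0.542,0.172,0.094,0.612,0.362,0.184,0.089"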
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + if [ ! -f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/brown.txt + tail -n +5000 ${dir}/brown.txt > ${dir}/data/text/brown.txt + if [ -d "data/local/wellingtoncorpus" ]; then + cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/wellington.txt + fi + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -5000 ${dir}/brown.txt > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + if [ -d "data/local/wellingtoncorpus" ]; then + cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + else + echo "$0: Wellington Corpus not found. Proceeding without using that corpus." + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + fi + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
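A quick, optional sanity check (a sketch only, not part of the recipe): estimate the dev OOV rate of the wordlist chosen in stage 0 before training the LM:

awk 'NR==FNR {vocab[$1]=1; next}
     {for (i=1; i<=NF; i++) {total++; if (!($i in vocab)) oov++}}
     END {printf("OOV tokens: %d/%d (%.2f%%)\n", oov, total, 100*oov/total)}' \
  data/local/local_lm/data/wordlist data/local/local_lm/data/text/dev.txt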
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='brown=2 lob=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v2/local/wer_output_filter b/egs/iam/v2/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/iam/v2/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v2/path.sh b/egs/iam/v2/path.sh new file mode 100755 index 00000000000..7e458144624 --- /dev/null +++ b/egs/iam/v2/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/home/dpovey/libs:$LD_LIBRARY_PATH +export LC_ALL=C diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh new file mode 100755 index 00000000000..c515c85fc72 --- /dev/null +++ b/egs/iam/v2/run_end2end.sh @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +process_aachen_split=false +overwrite=false +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. 
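The inline python3 block that follows simply collects the set of unique characters in the training transcripts. A rough shell-only equivalent is sketched below for clarity; note that fold is byte-based, so the Python version is the safe choice for multi-byte UTF-8 text:

cut -d' ' -f2- data/train/text | tr -d ' ' | fold -w1 | sort -u | head   # inspect the character set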
+ cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val train_aug; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 5 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 6 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh --train_set train_aug +fi + +if [ $stage -le 7 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train +fi + +if [ $stage -le 8 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
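Once this final stage has finished, the results can be summarized with something like the sketch below (the decode directory name depends on the affix used inside run_cnn_e2eali.sh and is hypothetical here):

for d in exp/chain/cnn_e2eali_1b/decode_test*; do grep WER $d/wer_* | utils/best_wer.sh; done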
+ local/chain/run_cnn_e2eali.sh --train_set train_aug +fi diff --git a/egs/iam/v2/steps b/egs/iam/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v2/utils b/egs/iam/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh index d320f49d3aa..10650a18269 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh @@ -136,7 +136,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.08 dropout-per-dim-continuous=true" output_opts="l2-regularize=0.02 bottleneck-dim=256" diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh index 56f5255288c..db62e6f8a55 100755 --- a/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1b.sh @@ -136,7 +136,7 @@ if [ $stage -le 12 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.08 dropout-per-dim=true dropout-per-dim-continuous=true" linear_opts="orthonormal-constraint=-1.0" output_opts="l2-regularize=0.04" diff --git a/egs/iban/s5/run.sh b/egs/iban/s5/run.sh index 991d32505bf..278a8177c0e 100755 --- a/egs/iban/s5/run.sh +++ b/egs/iban/s5/run.sh @@ -68,7 +68,7 @@ if [ $stage -le 4 ]; then echo "Starting triphone training." steps/align_si.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali - steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ 3200 30000 data/train data/lang exp/mono_ali exp/tri1 echo "Triphone training done." @@ -78,7 +78,7 @@ if [ $stage -le 4 ]; then steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ exp/tri1/graph data/dev exp/tri1/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri1/decode_dev exp/tri1/decode_dev.rescored echo "Triphone decoding done." @@ -89,7 +89,7 @@ if [ $stage -le 5 ]; then ## Triphones + delta delta # Training echo "Starting (larger) triphone training." - steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ data/train data/lang exp/tri1 exp/tri1_ali steps/train_deltas.sh --cmd "$train_cmd" \ 4200 40000 data/train data/lang exp/tri1_ali exp/tri2a @@ -97,11 +97,11 @@ if [ $stage -le 5 ]; then ( echo "Decoding the dev set using triphone(large) models." 
- utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph + utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ - exp/tri2a/graph data/dev exp/tri2a/decode_dev + exp/tri2a/graph data/dev exp/tri2a/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri2a/decode_dev exp/tri2a/decode_dev.rescored echo "Triphone(large) decoding done." @@ -112,21 +112,21 @@ if [ $stage -le 6 ]; then ### Triphone + LDA and MLLT # Training echo "Starting LDA+MLLT training." - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri2a exp/tri2a_ali - steps/train_lda_mllt.sh --cmd "$train_cmd" \ + steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b + 4200 40000 data/train data/lang exp/tri2a_ali exp/tri2b echo "LDA+MLLT training done." ( echo "Decoding the dev set using LDA+MLLT models." - utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph - steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ - exp/tri2b/graph data/dev exp/tri2b/decode_dev + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + steps/decode.sh --nj $dev_nj --cmd "$decode_cmd" \ + exp/tri2b/graph data/dev exp/tri2b/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri2b/decode_dev exp/tri2b/decode_dev.rescored echo "LDA+MLLT decoding done." @@ -138,7 +138,7 @@ if [ $stage -le 7 ]; then ### Triphone + LDA and MLLT + SAT and FMLLR # Training echo "Starting SAT+FMLLR training." - steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ data/train data/lang exp/tri2b_ali exp/tri3b @@ -150,7 +150,7 @@ if [ $stage -le 7 ]; then steps/decode_fmllr.sh --nj $dev_nj --cmd "$decode_cmd" \ exp/tri3b/graph data/dev exp/tri3b/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/tri3b/decode_dev exp/tri3b/decode_dev.rescored echo "SAT+FMLLR decoding done." @@ -163,10 +163,10 @@ if [ $stage -le 8 ]; then steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ data/train data/lang exp/tri3b exp/tri3b_ali - steps/train_ubm.sh --cmd "$train_cmd" \ + steps/train_ubm.sh --cmd "$train_cmd" \ 600 data/train data/lang exp/tri3b_ali exp/ubm5b2 - steps/train_sgmm2.sh --cmd "$train_cmd" \ + steps/train_sgmm2.sh --cmd "$train_cmd" \ 5200 12000 data/train data/lang exp/tri3b_ali exp/ubm5b2/final.ubm exp/sgmm2_5b2 echo "SGMM training done." 
@@ -180,7 +180,7 @@ if [ $stage -le 8 ]; then --transform-dir exp/tri3b/decode_dev \ exp/sgmm2_5b2/graph data/dev exp/sgmm2_5b2/decode_dev - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_test/ data/lang_big/ data/dev \ exp/sgmm2_5b2/decode_dev exp/sgmm2_5b2/decode_dev.rescored diff --git a/egs/ifnenit/v1/README.txt b/egs/ifnenit/README.txt similarity index 100% rename from egs/ifnenit/v1/README.txt rename to egs/ifnenit/README.txt diff --git a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh index b0e147d157b..b0ecd547741 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh @@ -123,7 +123,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70" mkdir -p $dir/configs diff --git a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh index b1f33b41a0c..7f3132d657e 100755 --- a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh @@ -128,7 +128,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" diff --git a/egs/ifnenit/v1/local/make_features.py b/egs/ifnenit/v1/local/make_features.py index 3a485e32eb1..87afa37c00a 100755 --- a/egs/ifnenit/v1/local/make_features.py +++ b/egs/ifnenit/v1/local/make_features.py @@ -10,7 +10,7 @@ eg. 
local/make_features.py data/train --feat-dim 40 """ - +from __future__ import division import argparse import os @@ -24,8 +24,8 @@ signal(SIGPIPE,SIG_DFL) parser = argparse.ArgumentParser(description="""Generates and saves the feature vectors""") -parser.add_argument('dir', type=str, help='directory of images.scp and is also output directory') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file') +parser.add_argument('dir', help='directory of images.scp and is also output directory') +parser.add_argument('--out-ark', default='-', help='where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, help='size to scale the height of all images') parser.add_argument('--padding', type=int, default=5, help='size to scale the height of all images') args = parser.parse_args() @@ -42,7 +42,7 @@ def write_kaldi_matrix(file_handle, matrix, key): if num_cols != len(matrix[row_index]): raise Exception("All the rows of a matrix are expected to " "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + file_handle.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: file_handle.write("\n") file_handle.write(" ]\n") @@ -51,7 +51,7 @@ def get_scaled_image(im): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] - scale = (1.0 * scale_size) / sy + scale = (1.0 * scale_size)/ sy nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) diff --git a/egs/librispeech/s5/RESULTS b/egs/librispeech/s5/RESULTS index 32b39b2c634..dbf54b9384d 100644 --- a/egs/librispeech/s5/RESULTS +++ b/egs/librispeech/s5/RESULTS @@ -1,6 +1,6 @@ # In the results below, "tgsmall" is the pruned 3-gram LM, which is used for lattice generation. 
# The following language models are then used for rescoring: -# a) tgmed- slightly less pruned 3-gram LM +# a) tgmed- slightly less pruned 3-gram LM # b) tglarge- the full, non-pruned 3-gram LM # c) fglarge- non-pruned 4-gram LM # @@ -8,7 +8,7 @@ # whereas "dev-other" and "test-other" sets contain more challenging speech ### SAT GMM model trained on the "train-clean-100" set (100 hours "clean" speech) -### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri4b/decode_${lm}_${test}/wer* | best_wer.sh; done; echo; done +### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri4b/decode_${lm}_${test}/wer* | utils/best_wer.sh; done; echo; done %WER 8.20 [ 4459 / 54402, 695 ins, 427 del, 3337 sub ] exp/tri4b/decode_fglarge_dev_clean/wer_14_0.5 %WER 8.60 [ 4677 / 54402, 763 ins, 399 del, 3515 sub ] exp/tri4b/decode_tglarge_dev_clean/wer_16_0.0 %WER 10.39 [ 5655 / 54402, 711 ins, 648 del, 4296 sub ] exp/tri4b/decode_tgmed_dev_clean/wer_16_0.0 @@ -31,7 +31,7 @@ ### SAT GMM model trained on the combined "train-clean-100" + "train-clean-360" set (460 hours "clean" speech) -### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri5b/decode_${lm}_${test}/wer* | best_wer.sh; done; echo; done +### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri5b/decode_${lm}_${test}/wer* | utils/best_wer.sh; done; echo; done %WER 7.05 [ 3835 / 54402, 588 ins, 370 del, 2877 sub ] exp/tri5b/decode_fglarge_dev_clean/wer_15_0.5 %WER 7.49 [ 4077 / 54402, 623 ins, 376 del, 3078 sub ] exp/tri5b/decode_tglarge_dev_clean/wer_14_0.5 %WER 9.38 [ 5104 / 54402, 701 ins, 533 del, 3870 sub ] exp/tri5b/decode_tgmed_dev_clean/wer_15_0.0 @@ -54,7 +54,7 @@ ### SAT GMM model trained on the combined "train-clean-100" + "train-clean-360" + "train-other-500" set (960 hours) -### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri6b/decode_${lm}_${test}/wer* | best_wer.sh; done; echo; done +### for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge tgmed tgsmall; do grep WER exp/tri6b/decode_${lm}_${test}/wer* | utils/best_wer.sh; done; echo; done %WER 7.02 [ 3819 / 54402, 516 ins, 424 del, 2879 sub ] exp/tri6b/decode_fglarge_dev_clean/wer_14_1.0 %WER 7.33 [ 3988 / 54402, 506 ins, 468 del, 3014 sub ] exp/tri6b/decode_tglarge_dev_clean/wer_15_1.0 %WER 9.23 [ 5024 / 54402, 744 ins, 481 del, 3799 sub ] exp/tri6b/decode_tgmed_dev_clean/wer_13_0.0 @@ -337,7 +337,7 @@ %WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14 %WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17 %WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15 -%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 +%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 %WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17 %WER 6.05 [ 3291 / 54402, 384 
ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14 %WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15 @@ -423,7 +423,7 @@ %WER 17.64 [ 9231 / 52343, 764 ins, 1662 del, 6805 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgsmall_utt_offline/wer_14 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed data) # num_params=19.3M %WER 4.43 [ 2410 / 54402, 306 ins, 278 del, 1826 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 @@ -444,7 +444,7 @@ %WER 16.29 [ 8528 / 52343, 828 ins, 1320 del, 6380 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed and volumn-perturbed "cleaned" data) # num_params=19.3M, average training time=68.8s per job(on Tesla K80), real-time factor=1.23161 # for x in exp/nnet3_cleaned/tdnn_sp/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -465,6 +465,24 @@ %WER 14.78 [ 7737 / 52343, 807 ins, 1115 del, 5815 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_15_0.0 %WER 16.28 [ 8521 / 52343, 843 ins, 1258 del, 6420 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 +# Results with nnet3 tdnn with new configs, a.k.a. xconfig +# local/nnet3/run_tdnn.sh (linked to local/nnet3/tuning/run_tdnn_1b.sh) +%WER 4.60 [ 2502 / 54402, 324 ins, 286 del, 1892 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.80 [ 2612 / 54402, 350 ins, 285 del, 1977 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tglarge/wer_11_1.0 +%WER 5.97 [ 3248 / 54402, 460 ins, 310 del, 2478 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgmed/wer_11_0.0 +%WER 6.66 [ 3625 / 54402, 479 ins, 392 del, 2754 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgsmall/wer_11_0.0 +%WER 12.29 [ 6262 / 50948, 863 ins, 665 del, 4734 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_fglarge/wer_15_0.0 +%WER 12.89 [ 6565 / 50948, 773 ins, 853 del, 4939 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tglarge/wer_14_0.5 +%WER 15.41 [ 7849 / 50948, 894 ins, 1083 del, 5872 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgmed/wer_15_0.0 +%WER 16.81 [ 8562 / 50948, 896 ins, 1215 del, 6451 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgsmall/wer_14_0.0 +%WER 4.99 [ 2624 / 52576, 393 ins, 253 del, 1978 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_fglarge/wer_13_0.5 +%WER 5.16 [ 2715 / 52576, 359 ins, 319 del, 2037 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tglarge/wer_12_1.0 +%WER 6.29 [ 3307 / 52576, 471 ins, 341 del, 2495 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgmed/wer_12_0.0 +%WER 7.13 [ 3750 / 52576, 473 ins, 452 del, 2825 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.73 [ 6665 / 52343, 894 ins, 711 del, 5060 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_fglarge/wer_14_0.0 +%WER 13.33 [ 6979 / 52343, 920 ins, 796 del, 5263 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tglarge/wer_14_0.0 +%WER 15.90 [ 8323 / 52343, 921 ins, 1126 del, 6276 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 17.28 [ 9044 / 52343, 894 ins, 1372 del, 6778 sub ] 
exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with nnet3 tdnn+sMBR # local/nnet3/run_tdnn_discriminative.sh diff --git a/egs/librispeech/s5/local/chain/run_cnn_tdnn.sh b/egs/librispeech/s5/local/chain/run_cnn_tdnn.sh new file mode 100755 index 00000000000..cd8f38d8309 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_cnn_tdnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_tdnn_1a.sh diff --git a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh index 6bf3a139ad1..ac3b74ed0c5 100755 --- a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh @@ -95,8 +95,8 @@ if [ $frame_subsampling_factor -ne 1 ]; then data_dirs= for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do - steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ - $x $train_data_dir exp/shift_hires mfcc_hires + utils/data/shift_feats.sh \ + $x $train_data_dir ${train_data_dir}_fs$x utils/fix_data_dir.sh ${train_data_dir}_fs$x data_dirs="$data_dirs ${train_data_dir}_fs$x" awk -v nfs=$x '{print "fs"nfs"-"$0}' $train_ivector_dir/ivector_online.scp >> ${train_ivector_dir}_fs/ivector_online.scp diff --git a/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh b/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh new file mode 120000 index 00000000000..a4fa11e0908 --- /dev/null +++ b/egs/librispeech/s5/local/chain/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..db17a35be64 --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# This is based on tdnn_1d_sp, but adding cnn as the front-end. +# The cnn-tdnn-f (tdnn_cnn_1a_sp) outperforms the tdnn-f (tdnn_1d_sp). + +# bash local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1d_sp exp/chain_cleaned/tdnn_cnn_1a_sp/ +# System tdnn_1d_sp tdnn_cnn_1a_sp +# WER on dev(fglarge) 3.29 3.34 +# WER on dev(tglarge) 3.44 3.39 +# WER on dev(tgmed) 4.22 4.29 +# WER on dev(tgsmall) 4.72 4.77 +# WER on dev_other(fglarge) 8.71 8.62 +# WER on dev_other(tglarge) 9.05 9.00 +# WER on dev_other(tgmed) 11.09 10.93 +# WER on dev_other(tgsmall) 12.13 12.02 +# WER on test(fglarge) 3.80 3.69 +# WER on test(tglarge) 3.89 3.80 +# WER on test(tgmed) 4.72 4.64 +# WER on test(tgsmall) 5.19 5.16 +# WER on test_other(fglarge) 8.76 8.71 +# WER on test_other(tglarge) 9.19 9.11 +# WER on test_other(tgmed) 11.22 11.00 +# WER on test_other(tgsmall) 12.24 12.16 +# Final train prob -0.0378 -0.0420 +# Final valid prob -0.0374 -0.0400 +# Final train prob (xent) -0.6099 -0.6881 +# Final valid prob (xent) -0.6353 -0.7180 +# Num-parameters 22623456 18100736 + + +set -e + +# configs for 'chain' +stage=0 +decode_nj=50 +train_set=train_960_cleaned +gmm=tri6b_cleaned +nnet3_affix=_cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=cnn_1a +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# TDNN options +frames_per_eg=150,110,100 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. 
./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # MFCC to filterbank + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + batchnorm-component name=idct-batchnorm input=idct + + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=10 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=256 + + # the first TDNN-F layer has no bypass + tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=1536 bottleneck-dim=256 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf18 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --use-gpu "wait" \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 2500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00015 \ + --trainer.optimization.final-effective-lrate 0.000015 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; + +fi + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir +fi + +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 18 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for data in test_clean test_other dev_clean dev_other; do + ( + nspk=$(wc -l from the graph - fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst $graph_dir/HCLG.fst fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 7129827fe19..48d6ddb804f 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -122,7 +122,7 @@ if [ $stage -le 14 ]; then # create the config files for nnet initialization num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -206,10 +206,6 @@ if [ $stage -le 16 ]; then # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir - # remove from the graph, and convert back to const-FST. - fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ - fstconvert --fst_type=const > $graph_dir/temp.fst - mv $graph_dir/temp.fst $graph_dir/HCLG.fst fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh index 29ebe62ddde..101fd6a4c15 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh @@ -112,7 +112,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) opts="l2-regularize=0.002" linear_opts="orthonormal-constraint=1.0" output_opts="l2-regularize=0.0005 bottleneck-dim=256" @@ -197,10 +197,6 @@ if [ $stage -le 16 ]; then # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir - # remove from the graph, and convert back to const-FST. 
- fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ - fstconvert --fst_type=const > $graph_dir/temp.fst - mv $graph_dir/temp.fst $graph_dir/HCLG.fst fi iter_opts= diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh index 81b621ef86f..865b10dea0c 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -207,7 +207,7 @@ if [ $stage -le 14 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.75" linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" @@ -297,10 +297,6 @@ if [ $stage -le 16 ]; then # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir - # remove from the graph, and convert back to const-FST. - fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ - fstconvert --fst_type=const > $graph_dir/temp.fst - mv $graph_dir/temp.fst $graph_dir/HCLG.fst fi iter_opts= diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..0e97e46194d --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,218 @@ +#!/bin/bash +# this is the tdnn-lstmp based on the run_tdnn_lstm_1n.sh under Switchboard. + +# training acoustic model and decoding: +# local/chain/tuning/run_tdnn_lstm_1a.sh +# System tdnn_lstm1a_sp +# WER on dev(fglarge) 3.44 +# WER on dev(tglarge) 3.55 +# WER on dev_other(fglarge) 8.63 +# WER on dev_other(tglarge) 9.09 +# WER on test(fglarge) 3.78 +# WER on test(tglarge) 3.94 +# WER on test_other(fglarge) 8.83 +# WER on test_other(tglarge) 9.09 +# Final train prob -0.0452 +# Final valid prob -0.0477 +# Final train prob (xent) -0.7874 +# Final valid prob (xent) -0.8150 +# Num-parameters 27790288 +# exp/chain_cleaned/tdnn_lstm1a_sp/: num-iters=1303 nj=3..16 num-params=27.8M dim=40+100->6056 combine=-0.041->-0.040 (over 9) xent:train/valid[867,1302,final]=(-1.15,-0.782,-0.787/-1.18,-0.810,-0.815) logprob:train/valid[867,1302,final]=(-0.063,-0.047,-0.045/-0.062,-0.049,-0.048) + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1a +decode_iter= +decode_nj=50 + +# LSTM training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=false +common_egs_dir= +nnet3_affix=_cleaned +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=256 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1280 recurrent-projection-dim=256 non-recurrent-projection-dim=128 delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir +fi + + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..0da813267fc --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,253 @@ +#!/bin/bash +# this is the tdnn-lstmp based on the run_tdnn_lstm_1a.sh under Librispeech but with larger model size. 
+ +# training acoustic model and decoding: +# local/chain/tuning/run_tdnn_lstm_1b.sh +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_lstm1a_sp exp/chain_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(fglarge) 3.44 3.36 +# WER on dev(tglarge) 3.55 3.48 +# WER on dev(tgmed) 4.41 4.26 +# WER on dev(tgsmall) 4.82 4.71 +# WER on dev_other(fglarge) 8.63 8.43 +# WER on dev_other(tglarge) 9.09 8.94 +# WER on dev_other(tgmed) 10.99 10.65 +# WER on dev_other(tgsmall) 11.95 11.51 +# WER on test(fglarge) 3.78 3.83 +# WER on test(tglarge) 3.94 3.93 +# WER on test(tgmed) 4.68 4.72 +# WER on test(tgsmall) 5.11 5.10 +# WER on test_other(fglarge) 8.83 8.69 +# WER on test_other(tglarge) 9.09 9.10 +# WER on test_other(tgmed) 11.05 10.86 +# WER on test_other(tgsmall) 12.18 11.83 +# Final train prob -0.0452 -0.0417 +# Final valid prob -0.0477 -0.0459 +# Final train prob (xent) -0.7874 -0.7488 +# Final valid prob (xent) -0.8150 -0.7757 +# Num-parameters 27790288 45245520 + +# rnn-lm rescoring: +# local/rnnlm/tuning/run_tdnn_lstm_1a.sh --ac-model-dir exp/chain_cleaned/tdnn_lstm1b_sp/ +# System tdnn_lstm1b_sp +# WER on dev(fglarge_nbe_rnnlm) 2.73 +# WER on dev(fglarge_lat_rnnlm) 2.83 +# WER on dev(fglarge) 3.36 +# WER on dev(tglarge) 3.48 +# WER on dev_other(fglarge_nbe_rnnlm) 7.20 +# WER on dev_other(fglarge_lat_rnnlm) 7.23 +# WER on dev_other(fglarge) 8.43 +# WER on dev_other(tglarge) 8.94 +# WER on test(fglarge_nbe_rnnlm) 3.10 +# WER on test(fglarge_lat_rnnlm) 3.22 +# WER on test(fglarge) 3.83 +# WER on test(tglarge) 3.93 +# WER on test_other(fglarge_nbe_rnnlm) 7.54 +# WER on test_other(fglarge_lat_rnnlm) 7.65 +# WER on test_other(fglarge) 8.69 +# WER on test_other(tglarge) 9.10 +# Final train prob -0.0417 +# Final valid prob -0.0459 +# Final train prob (xent) -0.7488 +# Final valid prob (xent) -0.7757 +# Num-parameters 45245520 + + + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=1b +decode_iter= +decode_nj=50 + +# LSTM training options +frames_per_chunk=140,100,160 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' + +remove_egs=false +common_egs_dir= +nnet3_affix=_cleaned +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=320 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=320 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=lstm1l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm1 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=lstm2l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm2 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384 delay=-3 dropout-proportion=0.0 $lstm_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=lstm3l dim=320 $linear_opts input=Append(-3,0) + fast-lstmp-layer name=lstm3 cell-dim=1536 recurrent-projection-dim=384 non-recurrent-projection-dim=384: delay=-3 dropout-proportion=0.0 $lstm_opts + + output-layer name=output input=lstm3 include-log-softmax=false $output_opts + + output-layer name=output-xent input=lstm3 learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/c0{1,2,5,7}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; +fi + + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir +fi + + +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi diff --git a/egs/librispeech/s5/local/data_prep.sh b/egs/librispeech/s5/local/data_prep.sh index dea93525e28..20c5697d61f 100755 --- a/egs/librispeech/s5/local/data_prep.sh +++ b/egs/librispeech/s5/local/data_prep.sh @@ -31,7 +31,6 @@ wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp trans=$dst/text; [[ -f "$trans" ]] && rm $trans utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender -utt2dur=$dst/utt2dur; [[ -f "$utt2dur" ]] && rm $utt2dur for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do reader=$(basename $reader_dir) @@ -79,8 +78,6 @@ nutt2spk=$(wc -l <$utt2spk) ! [ "$ntrans" -eq "$nutt2spk" ] && \ echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1; -utils/data/get_utt2dur.sh $dst 1>&2 || exit 1 - utils/validate_data_dir.sh --no-feats $dst || exit 1; echo "$0: successfully prepared data in $dst" diff --git a/egs/librispeech/s5/local/download_and_untar.sh b/egs/librispeech/s5/local/download_and_untar.sh index d01e681fed7..1bb6d909edc 100755 --- a/egs/librispeech/s5/local/download_and_untar.sh +++ b/egs/librispeech/s5/local/download_and_untar.sh @@ -67,7 +67,9 @@ if [ -f $data/$part.tar.gz ]; then fi fi -if [ ! -f $data/$part.tar.gz ]; then +pushd $data + +if [ ! -f $part.tar.gz ]; then if ! which wget >/dev/null; then echo "$0: wget is not installed." exit 1; @@ -75,20 +77,19 @@ if [ ! -f $data/$part.tar.gz ]; then full_url=$url/$part.tar.gz echo "$0: downloading data from $full_url. This may take some time, please be patient." - cd $data if ! wget --no-check-certificate $full_url; then echo "$0: error executing wget $full_url" exit 1; fi fi -cd $data - if ! 
tar -xvzf $part.tar.gz; then echo "$0: error un-tarring archive $data/$part.tar.gz" exit 1; fi +popd >&/dev/null + touch $data/LibriSpeech/$part/.complete echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" diff --git a/egs/librispeech/s5/local/lm/python/text_post_process.py b/egs/librispeech/s5/local/lm/python/text_post_process.py index 4ffbbe04b1f..344c1b291bd 100755 --- a/egs/librispeech/s5/local/lm/python/text_post_process.py +++ b/egs/librispeech/s5/local/lm/python/text_post_process.py @@ -21,10 +21,10 @@ def parse_args(): parser.add_argument('--abort-long-sent', type=bool, default=False, help='If True and a sentence longer than "max-sent-len" detected' +\ 'exit with error code 1. If False, just split the long sentences.') - parser.add_argument('--sent-end-marker', type=str, default="DOTDOTDOT") - parser.add_argument("in_text", type=str, help="Input text") - parser.add_argument("out_text", type=str, help="Output text") - parser.add_argument("sent_bounds", type=str, + parser.add_argument('--sent-end-marker', default="DOTDOTDOT") + parser.add_argument("in_text", help="Input text") + parser.add_argument("out_text", help="Output text") + parser.add_argument("sent_bounds", help="A file that will contain a comma separated list of numbers, s.t. if" + "i is in this list, then there is a sententence break after token i") return parser.parse_args() @@ -66,7 +66,7 @@ def parse_args(): n_tokens += 1 start_scan = 4 current_line.append('SUN') - for i in xrange(start_scan, len(opl_tokens)): + for i in range(start_scan, len(opl_tokens)): m = re.match("^[A-Z]+\'?[A-Z\']*$", opl_tokens[i]) if m is not None: n_tokens += 1 diff --git a/egs/librispeech/s5/local/lm/python/text_pre_process.py b/egs/librispeech/s5/local/lm/python/text_pre_process.py index 6228079b3a3..b75d0711d13 100755 --- a/egs/librispeech/s5/local/lm/python/text_pre_process.py +++ b/egs/librispeech/s5/local/lm/python/text_pre_process.py @@ -20,13 +20,13 @@ def parse_args(): parser = argparse.ArgumentParser(description="Pre-process a book's text") - parser.add_argument("--in-encoding", type=str, default="utf-8", + parser.add_argument("--in-encoding", default="utf-8", help="Encoding to use when reading the input text") - parser.add_argument("--out-encoding", type=str, default="ascii", + parser.add_argument("--out-encoding", default="ascii", help="Encoding to use when writing the output text") - parser.add_argument('--sent-end-marker', type=str, default="DOTDOTDOT") - parser.add_argument("in_text", type=str, help="Input text") - parser.add_argument("out_text", type=str, help="Output text") + parser.add_argument('--sent-end-marker', default="DOTDOTDOT") + parser.add_argument("in_text", help="Input text") + parser.add_argument("out_text", help="Output text") return parser.parse_args() # http://rosettacode.org/wiki/Roman_numerals/Decode#Python diff --git a/egs/librispeech/s5/local/lm/train_lm.sh b/egs/librispeech/s5/local/lm/train_lm.sh index 04badd95b26..6e6ae5970fb 100755 --- a/egs/librispeech/s5/local/lm/train_lm.sh +++ b/egs/librispeech/s5/local/lm/train_lm.sh @@ -50,7 +50,7 @@ if [ "$stage" -le 1 ]; then split_files=$(eval "echo $split_prefix-{$(seq -s',' $normjobs | sed 's/,$//')}") find $corpus_dir -mindepth 1 -maxdepth 1 -type d |\ tee $tmp_dir/all_texts.txt |\ - utils/split_scp.pl - $split_files + utils/split_scp.pl /dev/stdin $split_files echo "Checking the splits ..." 
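The "print (0.5/$xent_regularize)" and xrange -> range edits in the hunks above are Python 3 compatibility fixes. A minimal, self-contained sketch of what they guard against (the value 0.1 is just the default xent_regularize used by these recipes):

# Under Python 3, print is a function, so the unparenthesized Python 2 form
# "print 0.5/x" is a syntax error, and xrange() no longer exists.  The
# parenthesized call and range() behave the same under both interpreters.
xent_regularize = 0.1
learning_rate_factor = 0.5 / xent_regularize
print(learning_rate_factor)   # 5.0, which is what the shell substitution captures
for i in range(4, 10):        # replaces the old xrange(start_scan, len(opl_tokens))
    pass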
total_count=$(wc -l <$tmp_dir/all_texts.txt) split_count=$(cat $split_files | wc -l | awk 'BEGIN{c=0} {c+=$1;} END{print c}') diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 28ee2b92004..00000000000 --- a/egs/librispeech/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# without cleanup: -# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & - - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -decode_nj=30 -train_set=train_960_cleaned -gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. -nnet3_affix=_cleaned - -# Options which are not passed through to run_ivector_common.sh -affix= -train_stage=-10 -common_egs_dir= -reporting_email= -remove_egs=true - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat </dev/null || true - for test in test_clean test_other dev_clean dev_other; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ - ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - -exit 0; diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..28ee2b92004 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
+stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat </dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..a96a1b33e6c --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# 1b is as 1a but uses xconfigs. + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn0 dim=1280 + relu-batchnorm-layer name=tdnn1 dim=1280 input=Append(-1,2) + relu-batchnorm-layer name=tdnn2 dim=1280 input=Append(-3,3) + relu-batchnorm-layer name=tdnn3 dim=1280 input=Append(-7,2) + relu-batchnorm-layer name=tdnn4 dim=1280 + output-layer name=output input=tdnn4 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs || exit 1; +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --feat-dir=$train_data_dir \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # this does offline decoding that should give about the same results as the + # real online decoding (the one with --per-utt true) + rm $dir/.error 2>/dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/librispeech/s5/local/prepare_dict.sh b/egs/librispeech/s5/local/prepare_dict.sh index f798a804355..f9efb2ee46b 100755 --- a/egs/librispeech/s5/local/prepare_dict.sh +++ b/egs/librispeech/s5/local/prepare_dict.sh @@ -75,7 +75,7 @@ if [ $stage -le 1 ]; then auto_vocab_splits=$(eval "echo $auto_vocab_prefix.{$(seq -s',' $nj | sed 's/,$//')}") awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $cmudict_plain $vocab |\ sort | tee $g2p_dir/vocab_autogen.full |\ - utils/split_scp.pl - $auto_vocab_splits || exit 1 + utils/split_scp.pl /dev/stdin $auto_vocab_splits || exit 1 echo "Autogenerating pronunciations for the words in $auto_vocab_prefix.* ..." $cmd JOB=1:$nj $g2p_dir/log/g2p.JOB.log \ local/g2p.sh $auto_vocab_prefix.JOB $g2p_model_dir $auto_lexicon_prefix.JOB || exit 1 diff --git a/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh b/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..137a972f3d9 --- /dev/null +++ b/egs/librispeech/s5/local/rnnlm/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,166 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2018 Ke Li + +# This script trains LMs on the librispeech-lm-norm.txt.gz. + +# rnnlm/train_rnnlm.sh: best iteration (out of 143) was 142, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 109.2 / 110.7. 
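The train/dev perplexity of 109.2 / 110.7 quoted above can be read off the objective values listed below, assuming the objf is the average natural-log probability per word (so perplexity = exp(-objf)); a quick check against the final values:

import math

final_train_objf = -4.68   # last value of the "Train objf" trace below
final_dev_objf = -4.71     # last value of the "Dev objf" trace below
print(math.exp(-final_train_objf))   # ~107.8, in line with the reported 109.2
print(math.exp(-final_dev_objf))     # ~111.1, in line with the reported 110.7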
+# Train objf: -5.74 -5.54 -5.44 -5.37 -5.32 -5.28 -5.25 -5.23 -5.20 -5.18 -5.15 -5.14 -5.12 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.96 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.92 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.78 -4.79 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.70 -4.70 -4.70 -4.70 -4.70 -4.69 -4.69 -4.69 -4.69 -4.69 -4.69 -4.68 -4.68 +# Dev objf: -5.99 -5.65 -5.53 -5.44 -5.38 -5.34 -5.30 -5.27 -5.22 -5.20 -5.18 -5.16 -5.14 -5.12 -5.11 -5.10 -5.09 -5.08 -5.07 -5.05 -5.04 -5.04 -5.03 -5.01 -5.00 -4.99 -4.99 -4.98 -4.97 -4.97 0.00 -4.96 -4.95 -4.95 -4.94 -4.93 -4.93 -4.92 -4.92 -4.91 -4.91 -4.90 -4.90 -4.89 -4.89 -4.89 -4.88 -4.88 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.87 -4.84 -4.84 -4.84 -4.83 -4.91 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.82 -4.81 -4.81 -4.81 -4.80 -4.80 -4.80 -4.80 -4.80 -4.79 -4.79 -4.79 -4.79 -4.79 -4.79 -4.78 -4.78 -4.79 -4.78 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.77 -4.76 -4.76 -4.76 -4.76 -4.76 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.75 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.74 -4.73 -4.74 -4.73 -4.73 -4.73 -4.73 -4.73 -4.73 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.72 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 -4.71 + +# WER summary on dev and test sets +# System tdnn_1d_sp +lattice_rescore +nbest_rescore +# WER on dev(fglarge) 3.34 2.71 2.62 +# WER on dev(tglarge) 3.44 2.75 2.66 +# WER on dev_other(fglarge) 8.70 7.37 7.55 +# WER on dev_other(tglarge) 9.25 7.56 7.73 +# WER on test(fglarge) 3.77 3.12 3.06 +# WER on test(tglarge) 3.85 3.18 3.11 +# WER on test_other(fglarge) 8.91 7.63 7.68 +# WER on test_other(tglarge) 9.31 7.83 7.95 + +# command to get the WERs above: +# tdnn_1d_sp +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}/wer* | best_wer.sh; done; done +# tdnn_1d_sp with lattice rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_rescore/wer* | best_wer.sh; done; done +# tdnn_1d_sp with nbest rescoring +# for test in dev_clean test_clean dev_other test_other; do for lm in fglarge tglarge; do grep WER exp/chain_cleaned/tdnn_1d_sp/decode_${test}_${lm}_rnnlm_1a_nbest_rescore/wer* | best_wer.sh; done; done + +# Begin configuration section. + +dir=exp/rnnlm_lstm_1a +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=-10 +train_stage=-10 +epochs=4 + +# variables for lattice rescoring +run_lat_rescore=true +run_nbest_rescore=true +run_backward_rnnlm=false +ac_model_dir=exp/chain_cleaned/tdnn_1d_sp +decode_dir_suffix=rnnlm_1a +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +. ./cmd.sh +. 
./utils/parse_options.sh + +text=data/local/lm/librispeech-lm-norm.txt.gz +lexicon=data/lang_nosp/words.txt +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $lexicon; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for run.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + if [ ! -f $text ]; then + wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P data/local/lm + fi + echo -n >$text_dir/dev.txt + # hold out one in every 2000 lines as dev data. + gunzip -c $text | cut -d ' ' -f2- | awk -v text_dir=$text_dir '{if(NR%2000 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/librispeech.txt +fi + +if [ $stage -le 1 ]; then + cp $lexicon $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --top-word-features=5000 \ + --use-constant-feature=true \ + --special-words=',,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig <$lat_dir/splice_opts - fi if [ $stage -le 3 ]; then @@ -133,7 +129,7 @@ if [ $stage -le 4 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" @@ -185,7 +181,7 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ @@ -193,7 +189,7 @@ if [ $stage -le 5 ]; then --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -201,11 +197,8 @@ if [ $stage -le 5 ]; then --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --cleanup.remove-egs=$remove_egs \ @@ -226,18 +219,20 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..3caf8ae4494 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a e2e_cnn_1a (with extra corpus text) +# WER 9.47 5.73 +# WER (rescored) 8.05 5.67 +# CER 2.45 1.45 +# CER (rescored) 2.10 1.42 +# Final train prob -0.0934 -0.0934 +# Final valid prob -0.0746 -0.0746 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 2.94M 2.94M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.071->-0.070 (over 5) logprob:train/valid[64,97,final]=(-0.089,-0.084,-0.093/-0.075,-0.073,-0.075) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 +common_egs_dir= +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index 
ba35f8b9ace..650a0704d80 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -13,6 +13,7 @@ be vertically or horizontally aligned). Hence to extract line image from line bounding box, page image is rotated and line image is cropped and saved. """ +from __future__ import division import sys import argparse @@ -21,22 +22,10 @@ import numpy as np from math import atan2, cos, sin, pi, degrees, sqrt from collections import namedtuple - +import random from scipy.spatial import ConvexHull from PIL import Image from scipy.misc import toimage -import logging - -sys.path.insert(0, 'steps') -logger = logging.getLogger('libs') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - parser = argparse.ArgumentParser(description="Creates line images from page image", epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " @@ -60,6 +49,12 @@ help='Path to the downloaded (and extracted) writing conditions file 3') parser.add_argument('--padding', type=int, default=400, help='padding across horizontal/verticle direction') +parser.add_argument('--pixel-scaling', type=int, default=30, + help='padding across horizontal/verticle direction') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() """ @@ -93,8 +88,8 @@ def unit_vector(pt0, pt1): (float, float): unit vector """ dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) - return (pt1[0] - pt0[0]) / dis_0_to_1, \ - (pt1[1] - pt0[1]) / dis_0_to_1 + return (pt1[0] - pt0[0])/ dis_0_to_1, \ + (pt1[1] - pt0[1])/ dis_0_to_1 def orthogonal_vector(vector): @@ -136,7 +131,7 @@ def bounding_area(index, hull): return {'area': len_p * len_o, 'length_parallel': len_p, 'length_orthogonal': len_o, - 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'rectangle_center': (min_p + float(len_p)/ 2, min_o + float(len_o)/ 2), 'unit_vector': unit_vector_p, } @@ -149,7 +144,7 @@ def to_xy_coordinates(unit_vector_angle, point): ------ (float, float): converted x,y coordinate of the unit vector. """ - angle_orthogonal = unit_vector_angle + pi / 2 + angle_orthogonal = unit_vector_angle + pi/ 2 return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) @@ -194,65 +189,6 @@ def rectangle_corners(rectangle): return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) -def get_orientation(origin, p1, p2): - """ - Given origin and two points, return the orientation of the Point p1 with - regards to Point p2 using origin. - Returns - ------- - integer: Negative if p1 is clockwise of p2. - """ - difference = ( - ((p2[0] - origin[0]) * (p1[1] - origin[1])) - - ((p1[0] - origin[0]) * (p2[1] - origin[1])) - ) - return difference - - -def compute_hull(points): - """ - Given input list of points, return a list of points that - made up the convex hull. 
- Returns - ------- - [(float, float)]: convexhull points - """ - hull_points = [] - start = points[0] - min_x = start[0] - for p in points[1:]: - if p[0] < min_x: - min_x = p[0] - start = p - - point = start - hull_points.append(start) - - far_point = None - while far_point is not start: - p1 = None - for p in points: - if p is point: - continue - else: - p1 = p - break - - far_point = p1 - - for p2 in points: - if p2 is point or p2 is p1: - continue - else: - direction = get_orientation(point, far_point, p2) - if direction > 0: - far_point = p2 - - hull_points.append(far_point) - point = far_point - return hull_points - - def minimum_bounding_box(points): """ Given a list of 2D points, it returns the minimum area rectangle bounding all the points in the point cloud. @@ -272,7 +208,6 @@ def minimum_bounding_box(points): hull_ordered = [points[index] for index in ConvexHull(points).vertices] hull_ordered.append(hull_ordered[0]) - #hull_ordered = compute_hull(points) hull_ordered = tuple(hull_ordered) min_rectangle = bounding_area(0, hull_ordered) @@ -301,8 +236,8 @@ def get_center(im): ------- (int, int): center of the image """ - center_x = im.size[0] / 2 - center_y = im.size[1] / 2 + center_x = float(im.size[0])/ 2 + center_y = float(im.size[1])/ 2 return int(center_x), int(center_y) @@ -314,9 +249,9 @@ def get_horizontal_angle(unit_vector_angle): (float): updated angle of the unit vector to be in radians. It is only in first or fourth quadrant. """ - if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + if unit_vector_angle > pi/ 2 and unit_vector_angle <= pi: unit_vector_angle = unit_vector_angle - pi - elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + elif unit_vector_angle > -pi and unit_vector_angle < -pi/ 2: unit_vector_angle = unit_vector_angle + pi return unit_vector_angle @@ -400,6 +335,36 @@ def update_minimum_bounding_box_input(bounding_box_input): return updated_minimum_bounding_box_input +def dilate_polygon(points, amount_increase): + """ Increases size of polygon given as a list of tuples. + Assumes points in polygon are given in CCW + """ + expanded_points = [] + for index, point in enumerate(points): + prev_point = points[(index - 1) % len(points)] + next_point = points[(index + 1) % len(points)] + prev_edge = np.subtract(point, prev_point) + next_edge = np.subtract(next_point, point) + + prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) + prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) + next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) + next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) + + bisect = np.add(prev_normal, next_normal) + bisect = np.divide(bisect, np.linalg.norm(bisect)) + + cos_theta = np.dot(next_normal, bisect) + hyp = float(amount_increase)/ cos_theta + + new_point = np.around(point + hyp * bisect) + new_point = new_point.astype(int) + new_point = new_point.tolist() + new_point = tuple(new_point) + expanded_points.append(new_point) + return expanded_points + + def set_line_image_data(image, line_id, image_file_name, image_fh): """ Given an image, saves a flipped line image. Line image file name is formed by appending the line id at the end page image name. 
@@ -438,50 +403,83 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) minimum_bounding_box_input.append(word_coordinate) updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) - bounding_box = minimum_bounding_box(updated_mbb_input) - - p1, p2, p3, p4 = bounding_box.corner_points - x1, y1 = p1 - x2, y2 = p2 - x3, y3 = p3 - x4, y4 = p4 - min_x = int(min(x1, x2, x3, x4)) - min_y = int(min(y1, y2, y3, y4)) - max_x = int(max(x1, x2, x3, x4)) - max_y = int(max(y1, y2, y3, y4)) - box = (min_x, min_y, max_x, max_y) - region_initial = im.crop(box) - rot_points = [] - p1_new = (x1 - min_x, y1 - min_y) - p2_new = (x2 - min_x, y2 - min_y) - p3_new = (x3 - min_x, y3 - min_y) - p4_new = (x4 - min_x, y4 - min_y) - rot_points.append(p1_new) - rot_points.append(p2_new) - rot_points.append(p3_new) - rot_points.append(p4_new) - - cropped_bounding_box = bounding_box_tuple(bounding_box.area, - bounding_box.length_parallel, - bounding_box.length_orthogonal, - bounding_box.length_orthogonal, - bounding_box.unit_vector, - bounding_box.unit_vector_angle, - set(rot_points) - ) - - rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) - img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) - x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] + if args.augment: + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + else: + bounding_box = minimum_bounding_box(points_ordered) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) 
+ box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( cropped_bounding_box, get_center(region_initial)) - min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - box = (min_x, min_y, max_x, max_y) - region_final = img2.crop(box) - set_line_image_data(region_final, id, image_file_name, image_fh) + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name, image_fh) def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): @@ -535,16 +533,16 @@ def check_writing_condition(wc_dict, base_name): Returns (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - - return True - + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True + else: + return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -564,8 +562,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': diff --git a/egs/madcat_ar/v1/local/download_data.sh b/egs/madcat_ar/v1/local/download_data.sh deleted file mode 100755 index 7061be49c2a..00000000000 --- a/egs/madcat_ar/v1/local/download_data.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Ashish Arora -# Apache 2.0 - -# This script downloads data splits for MADCAT Arabic dataset. -# It also check if madcat arabic data is present or not. - -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid -test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid -dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid -data_splits=data/download/data_splits - -. ./cmd.sh -. ./path.sh -. 
./utils/parse_options.sh || exit 1; - -if [ -d $data_splits ]; then - echo "$0: Not downloading the data splits as it is already there." -else - if [ ! -f $data_splits/madcat.train.raw.lineid ]; then - mkdir -p $data_splits - echo "$0: Downloading the data splits..." - wget -P $data_splits $train_split_url || exit 1; - wget -P $data_splits $test_split_url || exit 1; - wget -P $data_splits $dev_split_url || exit 1; - fi - echo "$0: Done downloading the data splits" -fi - -if [ -d $download_dir1 ]; then - echo "$0: madcat arabic data directory is present." -else - if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then - echo "$0: please download madcat data..." - fi -fi diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 70c5498626c..9fe588f31b8 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -1,10 +1,16 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 +augment='no_aug' +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -30,9 +36,10 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim \| \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/extract_lines.sh b/egs/madcat_ar/v1/local/extract_lines.sh index 50129ad38c9..ab87836ae3a 100755 --- a/egs/madcat_ar/v1/local/extract_lines.sh +++ b/egs/madcat_ar/v1/local/extract_lines.sh @@ -11,6 +11,8 @@ writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_split_file=data/download/data_splits/madcat.dev.raw.lineid data=data/local/dev +subset=false +augment=false echo "$0 $@" . ./cmd.sh @@ -35,7 +37,7 @@ done $cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 \ $log_dir/lines.JOB.scp $data/JOB $writing_condition1 $writing_condition2 $writing_condition3 \ - || exit 1; + --subset $subset --augment $augment || exit 1; ## concatenate the .scp files together. for n in $(seq $nj); do diff --git a/egs/madcat_ar/v1/local/prepare_data.sh b/egs/madcat_ar/v1/local/prepare_data.sh index d808d736845..1049db9826d 100755 --- a/egs/madcat_ar/v1/local/prepare_data.sh +++ b/egs/madcat_ar/v1/local/prepare_data.sh @@ -5,49 +5,65 @@ # 2017 Hossein Hadian # Apache 2.0 -# This script prepares the training and test data for MADCAT Arabic dataset -# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. +# This script downloads the data splits for MADCAT Arabic dataset and prepares the training +# validation, and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also uses Arabic Gigaword text corpus for language modeling. # Eg. local/prepare_data.sh -# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ +# Eg. 
text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11 +# وهناك تداخل بين الرأسمالية الإسرائيلية # utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 -# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 -# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +# images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 +# data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png -stage=0 download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits -images_scp_dir=data/local +train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid +data_splits=data/download/data_splits +stage=0 +download_dir=data/download +gigacorpus=data/local/gigawordcorpus +gigaword_loc=/export/corpora5/LDC/LDC2011T11 +use_extra_corpus_text=true . ./cmd.sh . ./path.sh . ./utils/parse_options.sh || exit 1; -mkdir -p data/{train,test,dev} - -if [ $stage -le 1 ]; then - echo "$0: Processing dev, train and test data..." - echo "Date: $(date)." - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.test.raw.lineid data/test $images_scp_dir/test/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 +if [ -d $data_splits ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_splits/madcat.train.raw.lineid ]; then + mkdir -p $data_splits + echo "$0: Downloading the data splits..." + wget -P $data_splits $train_split_url || exit 1; + wget -P $data_splits $test_split_url || exit 1; + wget -P $data_splits $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" +fi - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 +if [ -d $download_dir1 ]; then + echo "$0: madcat arabic data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi +fi - for dataset in dev test train; do - echo "$0: Fixing data directory for dataset: $dataset" - echo "Date: $(date)." - image/fix_data_dir.sh data/$dataset +mkdir -p $download_dir data/local +if $use_extra_corpus_text; then + mkdir -p $gigacorpus + cp -r $gigaword_loc/. 
$gigacorpus + for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do + for file in $gigacorpus/arb_gw_5/data/$newswire/*.gz; do + gzip -d $file + done + for file in $gigacorpus/arb_gw_5/data/$newswire/*; do + sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' $file >> $gigacorpus/arb_gw_5/data/${newswire}_combined.txt + done done fi diff --git a/egs/madcat_ar/v1/local/prepend_words.py b/egs/madcat_ar/v1/local/prepend_words.py deleted file mode 100755 index d53eb8974bf..00000000000 --- a/egs/madcat_ar/v1/local/prepend_words.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# This script, prepend '|' to every words in the transcript to mark -# the beginning of the words for finding the initial-space of every word -# after decoding. - -import sys, io - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -for line in infile: - output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index b57500cf2fa..a39bcfa87d3 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -24,24 +24,28 @@ " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " " data/train data/local/lines ", formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('database_path1', type=str, +parser.add_argument('database_path1', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path2', type=str, +parser.add_argument('database_path2', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('database_path3', type=str, +parser.add_argument('database_path3', help='Path to the downloaded (and extracted) madcat data') -parser.add_argument('data_splits', type=str, +parser.add_argument('data_splits', help='Path to file that contains the train/test/dev split information') -parser.add_argument('out_dir', type=str, +parser.add_argument('out_dir', help='directory location to write output files.') -parser.add_argument('images_scp_path', type=str, +parser.add_argument('images_scp_path', help='Path of input images.scp file(maps line image and location)') -parser.add_argument('writing_condition1', type=str, +parser.add_argument('writing_condition1', help='Path to the downloaded (and extracted) writing conditions file 1') -parser.add_argument('writing_condition2', type=str, +parser.add_argument('writing_condition2', help='Path to the downloaded (and extracted) writing conditions file 2') -parser.add_argument('writing_condition3', type=str, +parser.add_argument('writing_condition3', help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -97,50 +101,42 @@ def check_writing_condition(wc_dict): Returns: (bool): True if writing condition matches. 
""" - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True + else: + return True - return True - -def get_word_line_mapping(madcat_file_path): +def read_text(madcat_file_path): """ Maps every word in the page image to a corresponding line. Args: - madcat_file_path (string): complete path and name of the madcat xml file + madcat_file_path (string): complete path and name of the madcat xml file corresponding to the page image. Returns: + dict: Mapping every word in the page image to a corresponding line. """ + + word_line_dict = dict() doc = minidom.parse(madcat_file_path) zone = doc.getElementsByTagName('zone') for node in zone: line_id = node.getAttribute('id') - line_word_dict[line_id] = list() word_image = node.getElementsByTagName('token-image') for tnode in word_image: word_id = tnode.getAttribute('id') - line_word_dict[line_id].append(word_id) word_line_dict[word_id] = line_id - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. - """ text_line_word_dict = dict() - doc = minidom.parse(madcat_file_path) segment = doc.getElementsByTagName('segment') for node in segment: token = node.getElementsByTagName('token') for tnode in token: ref_word_id = tnode.getAttribute('ref_id') word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - word = unicodedata.normalize('NFKC',word) ref_line_id = word_line_dict[ref_word_id] if ref_line_id not in text_line_word_dict: text_line_word_dict[ref_line_id] = list() @@ -160,7 +156,6 @@ def get_line_image_location(): ### main ### - print("Processing '{}' data...".format(args.out_dir)) text_file = os.path.join(args.out_dir, 'text') @@ -188,23 +183,34 @@ def get_line_image_location(): madcat_xml_path, image_file_path, wc_dict = check_file_location() if wc_dict is None or not check_writing_condition(wc_dict): continue - if madcat_xml_path is not None: - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - line_word_dict = dict() - word_line_dict = dict() - get_word_line_mapping(madcat_xml_path) - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path) - base_name, b = base_name.split('.tif') - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for line_id in sorted(text_line_word_dict): + if args.augment: + key = (line_id + '.')[:-1] + for i in range(0, 3): + location_id = "_{}_scale{}".format(line_id, i) + line_image_file_name = base_name + location_id + '.png' + location = image_loc_dict[line_image_file_name] + image_file_path = os.path.join(location, line_image_file_name) + line = text_line_word_dict[key] + text = ' '.join(line) + base_line_image_file_name = line_image_file_name.split('.png')[0] + utt_id = "{}_{}_{}".format(writer_id, str(image_num).zfill(6), 
base_line_image_file_name) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 + else: + updated_base_name = "{}_{}.png".format(base_name, str(line_id).zfill(4)) location = image_loc_dict[updated_base_name] image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] + line = text_line_word_dict[line_id] text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(line_id).zfill(4)) text_fh.write(utt_id + ' ' + text + '\n') utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh index 2c11aba3e13..31564d25326 100755 --- a/egs/madcat_ar/v1/local/score.sh +++ b/egs/madcat_ar/v1/local/score.sh @@ -1,5 +1,5 @@ #!/bin/bash -steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh new file mode 100755 index 00000000000..cc44aa58a62 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --vertical-shift $verticle_shift \ + --augment 'random_shift' $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh new file mode 100755 index 00000000000..ccbb7119674 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 16.78 +# CER 5.22 +# Final train prob -0.1189 +# Final valid prob -0.1319 +# Final train prob (xent) -0.6395 +# Final valid prob (xent) -0.6732 +# Parameters 3.73M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/chain/cnn_e2eali_1a/: num-iters=24 nj=3..15 num-params=3.7M dim=56->392 combine=-0.125->-0.125 (over 1) xent:train/valid[15,23,final]=(-0.850,-1.24,-0.640/-0.901,-1.31,-0.673) logprob:train/valid[15,23,final]=(-0.149,-0.209,-0.119/-0.166,-0.229,-0.132) +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +srand=0 +remove_egs=true +lang_decode=data/lang +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
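+  # (Illustrative only; names follow the variables above and nothing here is required by the recipe.)
+  # A minimal sketch of how the alignment and tree directories produced in stages 2-3 are usually inspected:
+  #   tree-info $tree_dir/tree | grep num-pdfs      # pdf count that the xconfig in stage 4 must match
+  #   steps/info/chain_dir_info.pl $dir             # summary line like the one quoted at the top of this script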
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh similarity index 78% rename from egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh rename to egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh index 2c85e982ce6..3fca8cf5fdc 100755 --- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -3,40 +3,37 @@ # This script does end2end chain training (i.e. 
from scratch) -# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ # System e2e_cnn_1a -# WER 10.71 -# CER 2.85 -# Final train prob -0.0859 -# Final valid prob -0.1266 +# WER 19.30 +# CER 5.72 +# Final train prob -0.0734 +# Final valid prob -0.0607 # Final train prob (xent) # Final valid prob (xent) -# Parameters 2.94M +# Parameters 3.30M # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) +# exp/chain/e2e_cnn_1a/: num-iters=24 nj=3..15 num-params=3.3M dim=56->292 combine=-0.060->-0.060 (over 1) logprob:train/valid[15,23,final]=(-0.122,-0.143,-0.073/-0.105,-0.132,-0.061) set -e + # configs for 'chain' stage=0 -nj=70 +nj=30 train_stage=-10 get_egs_stage=-10 affix=1a # training options tdnn_dim=450 -num_epochs=2 -num_jobs_initial=6 -num_jobs_final=16 -minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 common_egs_dir= -l2_regularize=0.00005 frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" +cmvn_opts="--norm-means=false --norm-vars=false" train_set=train -lang_test=lang_test +lang_decode=data/lang # End configuration section. echo "$0 $@" # Print the command line for logging @@ -89,16 +86,17 @@ if [ $stage -le 2 ]; then common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs cat < $dir/configs/network.xconfig - input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim @@ -118,20 +116,21 @@ if [ $stage -le 3 ]; then --cmd "$cmd" \ --feat.cmvn-opts 
"$cmvn_opts" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ --chain.frame-subsampling-factor 4 \ --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -151,7 +150,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi diff --git a/egs/madcat_ar/v1/local/tl/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py new file mode 100755 index 00000000000..0d278e64122 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/process_waldo_data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" This script reads image and transcription mapping and creates the following files :text, utt2spk, images.scp. + Eg. local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ + utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 + images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 + data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +""" + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('image_transcription_file', type=str, + help='Path to the file containing line image path and transcription information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +args = parser.parse_args() + + +def read_image_text(image_text_path): + """ Given the file path containing, mapping information of line image + and transcription, it returns a dict. The dict contains this mapping + info. It can be accessed via line_id and will provide transcription. 
+ Returns: + -------- + dict: line_id and transcription mapping + """ + image_transcription_dict = dict() + with open(image_text_path, encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + image_path = line_vect[0] + line_id = os.path.basename(image_path).split('.png')[0] + transcription = line_vect[1:] + joined_transcription = list() + for word in transcription: + joined_transcription.append(word) + joined_transcription = " ".join(joined_transcription) + image_transcription_dict[line_id] = joined_transcription + return image_transcription_dict + + +### main ### +print("Processing '{}' data...".format(args.out_dir)) +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +image_transcription_dict = read_image_text(args.image_transcription_file) +for line_id in sorted(image_transcription_dict.keys()): + writer_id = line_id.strip().split('_')[-3] + updated_line_id = line_id + '.png' + image_file_path = os.path.join('lines', updated_line_id) + text = image_transcription_dict[line_id] + utt_id = line_id + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh new file mode 100755 index 00000000000..8d12f7d802f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora + +# This script performs full page text recognition on automatically extracted line images +# from madcat arabic data. It is created as a separate script because it performs +# data augmentation, uses a smaller language model and calls process_waldo_data for +# test images (automatically extracted line images). Data augmentation increases the image +# height and hence requires a different DNN architecture and different chain scripts. + +set -e +stage=0 +nj=70 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits +images_scp_dir=data/local +overwrite=false +subset=true +augment=true +verticle_shift=16 +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && !
$overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 + + for set in train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$set --subset $subset --augment $augment || exit 1 + done + + echo "$0: Preparing data..." + for set in dev train; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done + + local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi + +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in dev train test; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 \ + --verticle_shift $verticle_shift data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 \ + --verticle_shift $verticle_shift data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev train_aug; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." 
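+  # (Illustrative sketch, following the two commands below.) local/tl/train_lm.sh leaves its ARPA
+  # under data/local/local_lm/data/arpa/, and utils/format_lm.sh compiles it into G.fst in the lang
+  # directory given as its last argument; a quick sanity check of the ARPA is e.g.
+  #   gunzip -c data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz | head   # shows the \data\ header and n-gram counts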
+ local/tl/train_lm.sh --order 3 + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang +fi + +nj=30 +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/tl/chain/run_e2e_cnn.sh --nj $nj --train_set train_aug +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" + local/tl/chain/run_cnn_e2eali.sh --nj $nj --train_set train_aug +fi diff --git a/egs/madcat_ar/v1/local/tl/train_lm.sh b/egs/madcat_ar/v1/local/tl/train_lm.sh new file mode 100755 index 00000000000..524bb2e9f40 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/train_lm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +dir=data/local/local_lm +order=3 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
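+  # (Illustrative layout, following the comments above.) pocolm treats dev.txt as the held-out set
+  # and every other file under ${dir}/data/text/ as a training source, so this stage ends up with:
+  #   ${dir}/data/text/dev.txt         <- held-out set, taken from data/dev/text
+  #   ${dir}/data/text/train.txt       <- training source, taken from data/train/text
+  #   ${dir}/data/real_dev_set.txt     <- kept outside data/text/ and used only for perplexity reporting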
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 3b8a382cb00..903b288a834 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,20 +6,19 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the MADCAT training transcriptions. +# This script trains a LM on the training transcriptions. # It is based on the example scripts distributed with PocoLM # It will check if pocolm is installed and if not will proceed with installation set -e stage=0 - +dir=data/local/local_lm +order=6 echo "$0 $@" # Print the command line for logging . ./utils/parse_options.sh || exit 1; -dir=data/local/local_lm lm_dir=${dir}/data -segments=data/train/segmented_words mkdir -p $dir @@ -43,12 +42,10 @@ bypass_metaparam_optim_opt= # These example numbers of metaparameters is for 4-gram model (with min-counts) # running with train_lm.py. # The dev perplexity should be close to the non-bypassed model. -#bypass_metaparam_optim_opt= # Note: to use these example parameters, you may need to remove the .done files # to make sure the make_lm_dir.py be called and tain only 3-gram model #for order in 3; do #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done - if [ $stage -le 0 ]; then mkdir -p ${dir}/data mkdir -p ${dir}/data/text @@ -65,7 +62,13 @@ if [ $stage -le 0 ]; then # use the training data as an additional data source. # we can later fold the dev data into this. - cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + if [ -d "data/local/gigawordcorpus/arb_gw_5/data" ]; then + cat data/local/gigawordcorpus/arb_gw_5/data/nhr_arb_combined.txt | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/corpus_text.txt + fi # for reporting perplexities, we'll use the "real" dev set. 
# (the validation data is used as ${dir}/data/text/dev.txt to work @@ -75,12 +78,10 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from MADCAT text - cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/text/{train,corpus_text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi -order=3 - if [ $stage -le 1 ]; then # decide on the vocabulary. # Note: you'd use --wordlist if you had a previously determined word-list @@ -88,7 +89,7 @@ if [ $stage -le 1 ]; then # Note: if you have more than one order, use a certain amount of words as the # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - min_counts='train=2 madcat=1' + min_counts='corpus_text=2 train=1' wordlist=${dir}/data/wordlist lm_name="`basename ${wordlist}`_${order}" @@ -96,13 +97,34 @@ if [ $stage -le 1 ]; then lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" fi unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm - train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ --limit-unk-history=true \ ${bypass_metaparam_optim_opt} \ ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - mkdir -p ${dir}/data/arpa format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 20 million n-grams for a big LM for rescoring purposes. + size=20000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 10 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=10000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/madcat_ar/v1/local/wer_output_filter b/egs/madcat_ar/v1/local/wer_output_filter index c0f03e7178a..d6d46f3f565 100755 --- a/egs/madcat_ar/v1/local/wer_output_filter +++ b/egs/madcat_ar/v1/local/wer_output_filter @@ -2,6 +2,9 @@ # Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) # Apache 2.0 +# This script converts a BPE-encoded text to normal text and performs normalization. +# It is used in scoring. + use utf8; use open qw(:encoding(utf8)); diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index 14c8bf7a6ce..01bfdbed543 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -11,9 +11,7 @@ decode_gmm=false # download_dir{1,2,3} points to the database path on the JHU grid. 
If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -21,47 +19,50 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits - +images_scp_dir=data/local +overwrite=false +subset=false +augment=false +use_extra_corpus_text=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . ./path.sh . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. - ./local/check_tools.sh - mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi - -if [ $stage -le 1 ]; then - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ + --use_extra_corpus_text $use_extra_corpus_text + + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 done -fi -if [ $stage -le 2 ]; then - echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + echo "$0: Processing data..." 
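The guard at the start of this stage exits rather than silently overwriting an existing data/train/text. To rebuild the data directories anyway, or to resume from a later point, the relevant variables can be passed as options; a hedged example, assuming the usual stage variable near the top of the script (not shown in this hunk) is exposed through parse_options as in other Kaldi recipes:

./run.sh --stage 0 --overwrite true   # deliberately regenerate the data directories
./run.sh --stage 1                    # skip data preparation and start at feature extraction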
+ for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done fi -mkdir -p data/{train,test,dev}/data -if [ $stage -le 3 ]; then +if [ $stage -le 1 ]; then for dataset in test train; do local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset steps/compute_cmvn_stats.sh data/$dataset || exit 1; @@ -69,33 +70,53 @@ if [ $stage -le 3 ]; then utils/fix_data_dir.sh data/train fi -if [ $stage -le 4 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ - data/local/dict "" data/lang/temp data/lang + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 5 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
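The BPE stage above leaves the text in data/$set/text as space-separated sub-word tokens: reverse.py accounts for the right-to-left Arabic script, prepend_words.py marks word starts, apply_bpe.py splits words using the merges learned in data/local/bpe.txt, and the sed command strips the "@@" continuation markers; wer_output_filter later maps this representation back to words for scoring. A quick sanity check of the encoding on a single line (a sketch that assumes stage 2 has already produced data/local/bpe.txt):

echo "some transcript line" | utils/lang/bpe/reverse.py | \
  utils/lang/bpe/prepend_words.py | \
  utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | sed 's/@@//g'
# the exact sub-word pieces depend entirely on the learned merges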
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi -if [ $stage -le 6 ]; then +if [ $stage -le 4 ]; then steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ data/lang exp/mono fi -if [ $stage -le 7 ] && $decode_gmm; then - utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi -if [ $stage -le 8 ]; then +if [ $stage -le 6 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/mono exp/mono_ali @@ -103,14 +124,14 @@ if [ $stage -le 8 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 9 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi -if [ $stage -le 10 ]; then +if [ $stage -le 8 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/tri exp/tri_ali @@ -119,22 +140,22 @@ if [ $stage -le 10 ]; then data/train data/lang exp/tri_ali exp/tri3 fi -if [ $stage -le 11 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri3/graph \ data/test exp/tri3/decode_test fi -if [ $stage -le 12 ]; then +if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train data/lang exp/tri3 exp/tri3_ali fi -if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh +if [ $stage -le 11 ]; then + local/chain/run_cnn.sh fi -if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1a.sh --stage 2 +if [ $stage -le 12 ]; then + local/chain/run_cnn_chainali.sh --stage 2 fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 5d27476d3e1..62f4eeb7c71 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -7,9 +7,7 @@ nj=70 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # This corpus can be purchased here: -# https://catalog.ldc.upenn.edu/LDC2012T15, -# https://catalog.ldc.upenn.edu/LDC2013T09/, -# https://catalog.ldc.upenn.edu/LDC2013T15/. +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} download_dir1=/export/corpora/LDC/LDC2012T15/data download_dir2=/export/corpora/LDC/LDC2013T09/data download_dir3=/export/corpora/LDC/LDC2013T15/data @@ -17,7 +15,11 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits - +images_scp_dir=data/local +overwrite=false +subset=false +augment=false +use_extra_corpus_text=true . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . 
./path.sh @@ -27,102 +29,105 @@ data_splits_dir=data/download/data_splits mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} - if [ $stage -le 0 ]; then - echo "$0: Downloading data splits..." - echo "Date: $(date)." - local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ - --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: preparing data...$(date)" + local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 \ + --use_extra_corpus_text $use_extra_corpus_text + + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 + done + + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} done -fi -if [ $stage -le 2 ]; then - echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 fi -if [ $stage -le 3 ]; then - echo "$0: Obtaining image groups. calling get_image2num_frames" - echo "Date: $(date)." - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - echo "$0: Obtaining image groups. calling get_allowed_lengths" - echo "Date: $(date)." +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train -fi -if [ $stage -le 4 ]; then - for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " - echo "Date: $(date)." - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; + for set in test dev train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. 
$(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; done - echo "$0: Fixing data directory for train dataset" - echo "Date: $(date)." + echo "$0: Fixing data directory for train dataset $(date)." utils/fix_data_dir.sh data/train fi -if [ $stage -le 5 ]; then - echo "$0: Preparing dictionary and lang..." - cut -d' ' -f2- data/train/text | local/reverse.py | \ - local/prepend_words.py | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 6 ]; then +if [ $stage -le 3 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +decode_e2e=true +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt $lang_decode + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang $lang_rescore fi -if [ $stage -le 7 ]; then - echo "$0: Calling the flat-start chain recipe..." - echo "Date: $(date)." - local/chain/run_flatstart_cnn1a.sh --nj $nj -fi +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; -if [ $stage -le 8 ]; then - echo "$0: Aligning the training data using the e2e chain model..." - echo "Date: $(date)." 
- steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --use-gpu false \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train -fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test exp/chain/e2e_cnn_1a/decode_test{,_rescored} || exit 1 -if [ $stage -le 9 ]; then - echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - echo "Date: $(date)." - local/chain/run_cnn_e2eali_1b.sh --nj $nj + echo "$0: Done. Date: $(date). Results:" + local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ fi diff --git a/egs/madcat_zh/README.txt b/egs/madcat_zh/README.txt new file mode 100644 index 00000000000..4ea8df8bb3c --- /dev/null +++ b/egs/madcat_zh/README.txt @@ -0,0 +1,5 @@ +This directory contains example scripts for handwriting recognition on +the MADCAT Chinese HWR dataset (LDC2014T13). +This dataset consists of handwritten Chinese documents, scanned +at high resolution and annotated for each line and token. +More info: https://catalog.ldc.upenn.edu/LDC2014T13 diff --git a/egs/madcat_zh/v1/cmd.sh b/egs/madcat_zh/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/madcat_zh/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/madcat_zh/v1/image b/egs/madcat_zh/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/madcat_zh/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/compare_wer.sh b/egs/madcat_zh/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..4eb665fc702 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/compare_wer.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/madcat_zh/v1/local/chain/run_cnn.sh b/egs/madcat_zh/v1/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..86568421fe1 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1b.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh new file mode 100755 index 00000000000..164d62a7ad9 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_1a.sh @@ -0,0 +1,223 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ exp/chain/e2e_cnn_1a/ +# System cnn_1a cnn_chainali_1b e2e_cnn_1a +# WER 13.51 6.76 10.55 +# Final train prob -0.0291 -0.0138 -0.0702 +# Final valid prob -0.0712 -0.0171 -0.0578 +# Final train prob (xent) -0.3847 -0.4169 +# Final valid prob (xent) -0.4962 -0.5040 + +set -e -o pipefail + +stage=0 + +nj=50 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=60 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=4 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=false \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "$0: Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir + diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh similarity index 87% rename from egs/iam/v1/local/chain/run_cnn_chainali_1a.sh rename to egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh index ee3a1a3d92c..be51bdcc3d1 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -2,10 +2,16 @@ # chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/ + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/ +# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045) + +# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_* + set -e -o pipefail stage=0 - nj=30 train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it @@ -13,35 +19,25 @@ gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
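As the header comment notes, this system is trained on lattices produced by an existing chain model rather than by the GMM. Since those lattices are already at the chain model's reduced frame rate, the training stage further below keeps the frame subsampling but no longer subsamples the alignments, in contrast to run_cnn_1a.sh. A summary of just those flags, as they appear in the scripts in this diff:

# GMM-lattice system (run_cnn_1a.sh):
#   --chain.frame-subsampling-factor=4  --chain.alignment-subsampling-factor=4
# chain-lattice systems (run_cnn_chainali_1a.sh / run_cnn_chainali_1b.sh):
#   --chain.frame-subsampling-factor=4  --chain.alignment-subsampling-factor=1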
affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} common_egs_dir= reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 # we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging - . ./cmd.sh . ./path.sh . ./utils/parse_options.sh - if ! cuda-compiled; then cat < $dir/configs/network.xconfig input dim=40 name=input - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 @@ -175,27 +169,23 @@ if [ $stage -le 5 ]; then --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ - --trainer.srand=$srand \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=8 \ + --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ + --cleanup.remove-egs=false \ --use-gpu=true \ --reporting.email="$reporting_email" \ --feat-dir=$train_data_dir \ @@ -211,19 +201,14 @@ if [ $stage -le 6 ]; then # topology file from the model). So you could give it a different # lang directory, one that contained a wordlist and LM of your choice, # as long as phones.txt was compatible. 
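To make the point in the comment above concrete, the same acoustic model could be paired with a graph built from some other lang directory, provided its phones.txt matches. A purely hypothetical example (data/lang_other and graph_other are not part of this recipe):

utils/mkgraph.sh --self-loop-scale 1.0 data/lang_other $dir $dir/graph_other || exit 1;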
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 data/lang_test \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh new file mode 100755 index 00000000000..aa61620a92f --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/tuning/run_cnn_chainali_1b.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +# chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ + +# steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ +# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ exp/chain/e2e_cnn_1a/ +# System cnn_1a cnn_chainali_1b e2e_cnn_1a +# WER 13.51 6.76 10.55 +# Final train prob -0.0291 -0.0138 -0.0702 +# Final valid prob -0.0712 -0.0171 -0.0578 +# Final train prob (xent) -0.3847 -0.4169 +# Final valid prob (xent) -0.4962 -0.5040 + +set -e -o pipefail + +stage=0 +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= +# chain options +train_stage=-10 +xent_regularize=0.1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=60 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=15 height-out=15 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=0 \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=6 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=false \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "$0: Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..ffc9a4c8a14 --- /dev/null +++ b/egs/madcat_zh/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.41 +# Final train prob -0.0536 +# Final valid prob -0.0489 +# Final train prob (xent) +# Final valid prob (xent) + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=63 nj=6..12 num-params=6.1M dim=80->5760 combine=-0.048->-0.048 (over 5) logprob:train/valid[41,62,final]=(-0.062,-0.065,-0.054/-0.058,-0.062,-0.049) + +set -e +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=48,24/300=24,12/600=12,6/1200=4,4 +common_egs_dir= +train_set=train + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 70 --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=80 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=80 height-out=80 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=80 height-out=40 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=40 height-out=40 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=40 height-out=40 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=40 height-out=20 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=20 height-out=20 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn8 height-in=20 height-out=10 time-offsets=-1,0,1 $common3 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi diff --git a/egs/madcat_zh/v1/local/check_tools.sh b/egs/madcat_zh/v1/local/check_tools.sh new file mode 100755 index 00000000000..00de9778808 --- /dev/null +++ b/egs/madcat_zh/v1/local/check_tools.sh @@ -0,0 +1,49 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "from scipy.spatial import ConvexHull" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread'];" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image, scikit-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/madcat_zh/v1/local/create_line_image_from_page_image.py b/egs/madcat_zh/v1/local/create_line_image_from_page_image.py new file mode 100755 index 00000000000..22af571fc04 --- /dev/null +++ b/egs/madcat_zh/v1/local/create_line_image_from_page_image.py @@ -0,0 +1,536 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ + +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. 
To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. +""" + +import sys +import argparse +import os +import xml.dom.minidom as minidom +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple + +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage + +parser = argparse.ArgumentParser(description="Creates line images from page image", + epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" + " data/madcat.train.raw.lineid " + " data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded madcat data directory 1') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files') +parser.add_argument('--padding', type=int, default=400, + help='padding across horizontal/verticle direction') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + +def unit_vector(pt0, pt1): + """ Returns an unit vector that points in the direction of pt0 to pt1. + Args: + pt0 (float, float): Point 0. Eg. (1.0, 2.0). + pt1 (float, float): Point 1. Eg. (3.0, 8.0). + + Returns: + (float, float): unit vector that points in the direction of pt0 to pt1. + Eg. 0.31622776601683794, 0.9486832980505138 + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0])/ dis_0_to_1, \ + (pt1[1] - pt0[1])/ dis_0_to_1 + + +def orthogonal_vector(vector): + """ From vector returns a orthogonal/perpendicular vector of equal length. + Args: + vector (float, float): A vector. Eg. (0.31622776601683794, 0.9486832980505138). + + Returns: + (float, float): A vector that points in the direction orthogonal to vector. + Eg. - 0.9486832980505138,0.31622776601683794 + """ + return -1 * vector[1], vector[0] + + +def bounding_area(index, hull): + """ Returns a named tuple that mainly contains area of the box that bounds + the hull. This bounding box orintation is same as the orientation of the + lines formed by the point hull[index] and hull[index+1]. + Args: + index (int): Eg. 1. + hull [(float, float)]: list or tuple of point cloud + Eg. ((1.0, -1.0), (2.0, -3.0), (3.0, 4.0), (5.0, 6.0)). 
+ + Returns: a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector: direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + float(len_p)/ 2, min_o + float(len_o)/ 2), + 'unit_vector': unit_vector_p, + } + + +def to_xy_coordinates(unit_vector_angle, point): + """ Returns converted unit vector coordinates in x, y coordinates. + Args: + unit_vector_angle (float): angle of unit vector to be in radians. + Eg. 0.1543 . + point (float, float): Point from origin. Eg. (1.0, 2.0). + + Returns: + (float, float): converted x,y coordinate of the unit vector. + Eg. 0.680742447866183, 2.1299271629971663 + """ + angle_orthogonal = unit_vector_angle + pi/ 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + Args: + center_of_rotation (float, float): angle of unit vector to be in radians. + Eg. (1.56, -23.4). + angle (float): angle of rotation to be in radians. Eg. 0.1543 . + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Eg. ((1.56, -23.4), (1.56, -23.4)) + + Returns: + [(float, float)]: Rotated points around center of rotation by angle + Eg. ((1.16, -12.4), (2.34, -34.4)) + """ + + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination. It returns the corner + locations of the rectangle. + Args: + rectangle (bounding_box): the output of minimum bounding box rectangle + + Returns: + [(float, float)]: 4 corner points of rectangle. + Eg. ((1.0, -1.0), (2.0, -3.0), (3.0, 4.0), (5.0, 6.0)) + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + + +# use this function to find the listed properties of the minimum bounding box of a point cloud +def minimum_bounding_box(points): + """ Given a point cloud, it returns the minimum area rectangle bounding all + the points in the point cloud. 
+ Args: + points [(float, float)]: points to be a list or tuple of 2D points + needs to be more than 2 points + + Returns: returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector: direction of the length_parallel side. RADIANS + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle)) + ) + + +def get_center(im): + """ Returns the center pixel location of an image + Args: + im: image + + Returns: + (int, int): center of the image + Eg. 2550, 3300 + """ + center_x = float(im.size[0])/ 2 + center_y = float(im.size[1])/ 2 + return int(center_x), int(center_y) + + +def get_horizontal_angle(unit_vector_angle): + """ Returns angle of the unit vector in first or fourth quadrant. + Args: + angle (float): angle of the unit vector to be in radians. Eg. 0.01543. + + Returns: + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + Eg. 0.01543. + """ + + if unit_vector_angle > pi/ 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi/ 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + + +def get_smaller_angle(bounding_box): + """ Returns smallest absolute angle of a rectangle. + Args: + rectangle (bounding_box): bounding box rectangle + + Returns: + (float): smallest angle of the rectangle to be in radians. + Eg. 0.01543. + """ + + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + + +def rotated_points(bounding_box, center): + """ Rotates the corners of a bounding box rectangle around the center by smallest angle + of the rectangle. 
It first finds the smallest angle of the rectangle + then rotates it around the given center point. + Args: + rectangle (bounding_box): bounding box rectangle + center (int, int): center point around which the corners of rectangle are rotated. + Eg. (2550, 3300). + + Returns: 4 corner points of rectangle. + Eg. ((1.0, -1.0), (2.0, -3.0), (3.0, 4.0), (5.0, 6.0)) + """ + + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + + +def pad_image(image): + """ Pads the image around the border. It help in getting + bounding boxes that are slightly outside the page boundary. + Args: + image: page image. + + Returns: + image: page image + """ + + padded_image = Image.new('RGB', (image.size[0] + padding, image.size[1] + padding), "white") + padded_image.paste(im=image, box=(offset, offset)) + return padded_image + + +def update_minimum_bounding_box_input(bounding_box_input): + """ Updates the word bounding box corner points. + Args: + points [(float, float)]: points, a list or tuple of 2D coordinates. + ideally should be more than 2 points + Returns: + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + + updated_minimum_bounding_box_input = [] + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + + +def set_line_image_data(image, line_id, image_file_name): + """ Flips a given line image and saves it. Line image file name + is formed by appending the line id at the end page image name. + Args: + image: line image, non flipped + line_id (string): id of the line image. + image_file_name(string): name of the page image. + + Returns: + """ + + base_name = os.path.splitext(os.path.basename(image_file_name))[0] + image_file_name_wo_tif, b = image_file_name.split('.tif') + line_id = '_' + line_id.zfill(4) + line_image_file_name = base_name + line_id + '.png' + image_path = os.path.join(output_directory, line_image_file_name) + imgray = toimage(image.convert('L')) + imgray.save(image_path) + image_fh.write(image_path + '\n') + +def get_line_images_from_page_image(image_file_name, madcat_file_path): + """ Extracts the line image from page image. + Args: + image_file_name (string): complete path and name of the page image. 
+ madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + + Returns: + """ + im_wo_pad = Image.open(image_file_name) + im = pad_image(im_wo_pad) + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + id = node.getAttribute('id') + token_image = node.getElementsByTagName('token-image') + minimum_bounding_box_input = [] + for token_node in token_image: + word_point = token_node.getElementsByTagName('point') + for word_node in word_point: + word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) + minimum_bounding_box_input.append(word_coordinate) + updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) + bounding_box = minimum_bounding_box(updated_mbb_input) + + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + min_x = int(min(x1, x2, x3, x4)) + min_y = int(min(y1, y2, y3, y4)) + max_x = int(max(x1, x2, x3, x4)) + max_y = int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1_new = (x1 - min_x, y1 - min_y) + p2_new = (x2 - min_x, y2 - min_y) + p3_new = (x3 - min_x, y3 - min_y) + p4_new = (x4 - min_x, y4 - min_y) + rot_points.append(p1_new) + rot_points.append(p2_new) + rot_points.append(p3_new) + rot_points.append(p4_new) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample=Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name) + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + + madcat_file_path1 = os.path.join(data_path1, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(data_path1, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + print("ERROR: path does not exist") + return None, None, None + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + + Returns: + (dict): dictionary with key as page image name and value as writing condition. 
+ """ + + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. + Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. + + Returns: + (bool): True if writing condition matches. + """ + + return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +### main ### + +data_path1 = os.path.join(args.database_path1, 'data') + +splits_handle = open(args.data_splits, 'r') +splits_data = splits_handle.read().strip().split('\n') + +padding = int(args.padding) +offset = int(padding // 2) + +output_directory = args.out_dir +image_file = os.path.join(output_directory, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +writing_conditions1 = os.path.join(args.database_path1, 'docs', 'writing_conditions.tab') + +wc_dict1 = parse_writing_conditions(writing_conditions1) + +prev_base_name = '' +for line in splits_data: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_file_path, image_file_path, wc_dict = check_file_location() + if wc_dict == None or not check_writing_condition(wc_dict): + continue + if madcat_file_path != None: + get_line_images_from_page_image(image_file_path, madcat_file_path) diff --git a/egs/madcat_zh/v1/local/extract_features.sh b/egs/madcat_zh/v1/local/extract_features.sh new file mode 100755 index 00000000000..9fe588f31b8 --- /dev/null +++ b/egs/madcat_zh/v1/local/extract_features.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/madcat_zh/v1/local/extract_lines.sh b/egs/madcat_zh/v1/local/extract_lines.sh new file mode 100755 index 00000000000..ed752e97e13 --- /dev/null +++ b/egs/madcat_zh/v1/local/extract_lines.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Ashish Arora + +nj=4 +cmd=run.pl +download_dir=/export/corpora/LDC/LDC2014T13 +dataset_file=data/download/datasplits/madcat.dev.raw.lineid +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +log_dir=$data/log +mkdir -p $log_dir +mkdir -p $data + +for n in $(seq $nj); do + split_scps="$split_scps $log_dir/lines.$n.scp" +done + +utils/split_scp.pl $dataset_file $split_scps || exit 1; + +for n in $(seq $nj); do + mkdir -p $data/$n +done + +$cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ + local/create_line_image_from_page_image.py $download_dir $log_dir/lines.JOB.scp $data/JOB \ + || exit 1; + +## concatenate the .scp files together. +for n in $(seq $nj); do + cat $data/$n/images.scp || exit 1; +done > $data/images.scp || exit 1 diff --git a/egs/madcat_zh/v1/local/prepare_data.sh b/egs/madcat_zh/v1/local/prepare_data.sh new file mode 100755 index 00000000000..ba35b90b173 --- /dev/null +++ b/egs/madcat_zh/v1/local/prepare_data.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the Madcat Chinese handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +download_dir1=/export/corpora/LDC/LDC2014T13/data +train_split_url=http://www.openslr.org/resources/50/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/50/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/50/madcat.dev.raw.lineid +data_split_dir=data/download/datasplits + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [ -d $data_split_dir ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_split_dir/madcat.train.raw.lineid ]; then + mkdir -p $data_split_dir + echo "$0: Downloading the data splits..." + wget -P $data_split_dir $train_split_url || exit 1; + wget -P $data_split_dir $test_split_url || exit 1; + wget -P $data_split_dir $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" +fi + +if [ -d $download_dir1 ]; then + echo "$0: madcat chinese data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi +fi diff --git a/egs/madcat_zh/v1/local/prepare_dict.sh b/egs/madcat_zh/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..f9cd8387fad --- /dev/null +++ b/egs/madcat_zh/v1/local/prepare_dict.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. 
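+# It builds a character-level lexicon from the training transcripts: every word
+# is mapped to its sequence of characters, so a (hypothetical) two-character
+# word would get the entry "你好 你 好" in lexicon.txt. The non-silence phones
+# are the characters themselves, and SIL is the only silence phone.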
+ +set -e +dir=data/local/dict +mkdir -p $dir + +#local/prepare_lexicon.py data/train $dir +cat data/train/text | cut -d' ' -f2- | tr ' ' '\n' | sort -u | sed '/^$/d' | \ + python3 -c \ + 'import sys, io; \ + sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); \ + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); \ + [sys.stdout.write(line.strip() + " " + " ".join(list(line.strip())) + "\n") for line in sys.stdin];' > $dir/lexicon.txt + +cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/madcat_zh/v1/local/process_data.py b/egs/madcat_zh/v1/local/process_data.py new file mode 100755 index 00000000000..994a4486420 --- /dev/null +++ b/egs/madcat_zh/v1/local/process_data.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom +import unicodedata + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('data_splits', + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', + help='directory location to write output files.') +args = parser.parse_args() + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + + madcat_file_path1 = os.path.join(args.database_path1, 'data', 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'data', 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + + Returns: + (dict): dictionary with key as page image name and value as writing condition. + """ + + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. 
+ Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. + + Returns: + (bool): True if writing condition matches. + """ + + return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +def get_word_line_mapping(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + + Returns: + """ + + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + line_id = node.getAttribute('id') + line_word_dict[line_id] = list() + word_image = node.getElementsByTagName('token-image') + for tnode in word_image: + word_id = tnode.getAttribute('id') + line_word_dict[line_id].append(word_id) + word_line_dict[word_id] = line_id + + +def read_text(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + + Returns: + dict: Mapping every word in the page image to a corresponding line. + """ + + text_line_word_dict = dict() + doc = minidom.parse(madcat_file_path) + segment = doc.getElementsByTagName('segment') + for node in segment: + token = node.getElementsByTagName('token') + for tnode in token: + segment_id = tnode.getAttribute('id') + ref_word_id = tnode.getAttribute('ref_id') + word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue + word = unicodedata.normalize('NFKC',word) + ref_line_id = word_line_dict[ref_word_id] + if ref_line_id not in text_line_word_dict: + text_line_word_dict[ref_line_id] = list() + text_line_word_dict[ref_line_id].append(word) + return text_line_word_dict + + +def get_line_image_location(): + image_loc_dict = dict() # Stores image base name and location + image_loc_vect = input_image_fh.read().strip().split("\n") + for line in image_loc_vect: + base_name = os.path.basename(line) + location_vect = line.split('/') + location = "/".join(location_vect[:-1]) + image_loc_dict[base_name]=location + return image_loc_dict + + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +data_path1 = os.path.join(args.database_path1, 'data') + +input_image_file = os.path.join(args.out_dir, 'lines', 'images.scp') +input_image_fh = open(input_image_file, 'r', encoding='utf-8') + +writing_conditions1 = os.path.join(args.database_path1, 'docs', 'writing_conditions.tab') + +wc_dict1 = parse_writing_conditions(writing_conditions1) +image_loc_dict = get_line_image_location() + +image_num = 0 +with open(args.data_splits) as f: + prev_base_name = '' + for line in f: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_xml_path, image_file_path, wc_dict = check_file_location() + if wc_dict is None or not check_writing_condition(wc_dict): + continue + if madcat_xml_path is not None: + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + line_word_dict = dict() + word_line_dict = dict() + 
get_word_line_mapping(madcat_xml_path) + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path) + base_name, b = base_name.split('.tif') + for lineID in sorted(text_line_word_dict): + updated_base_name = "{}_{}.png".format(base_name, str(lineID).zfill(4)) + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(''.join(line)) + utt_id = "{}_{}_{}_{}".format(writer_id, str(image_num).zfill(6), base_name, str(lineID).zfill(4)) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num = image_num + 1 diff --git a/egs/madcat_zh/v1/local/process_segments.py b/egs/madcat_zh/v1/local/process_segments.py new file mode 100755 index 00000000000..3d09c0df190 --- /dev/null +++ b/egs/madcat_zh/v1/local/process_segments.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang + +""" This script reads the provided word segmentations of chinese + and ensures that all of them are normalized to the same + unicode form. +""" + +import argparse +import os +import unicodedata + +parser = argparse.ArgumentParser(description="""Takes in word segmentations and normalizes character form.""") +parser.add_argument('segmentation_path', type=str, + help='Path to chinese word segmentation') +parser.add_argument('out_dir', type=str, + help='Where to write output file') +args = parser.parse_args() + +segment_file = os.path.join(args.out_dir, 'segmented_words') +segment_fh = open(segment_file, 'w', encoding='utf-8') + +with open(args.segmentation_path, encoding='utf-8') as f: + for line in f: + line_normalize = unicodedata.normalize('NFKC', line) + segment_fh.write(line_normalize + '\n') diff --git a/egs/madcat_zh/v1/local/score.sh b/egs/madcat_zh/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/madcat_zh/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_zh/v1/local/train_lm.sh b/egs/madcat_zh/v1/local/train_lm.sh new file mode 100755 index 00000000000..a8e2dc71f28 --- /dev/null +++ b/egs/madcat_zh/v1/local/train_lm.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 madcat=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/madcat_zh/v1/local/wer_output_filter b/egs/madcat_zh/v1/local/wer_output_filter new file mode 100755 index 00000000000..5d5290ad8c3 --- /dev/null +++ b/egs/madcat_zh/v1/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" 
he said reverently, gripping his
+# hands. "Isn't it a glorious thing! Long awaited."
+
+# is converted to this:
+
+# " They have come ! " he said reverently , gripping his
+# hands . " Isn ' t it a glorious thing ! Long awaited . "
+
+import sys
+import io
+import re
+from collections import OrderedDict
+
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
+
+re_dict = OrderedDict([("“","\""), ("”","\"")])
+pattern = re.compile("|".join(re.escape(key) for key in re_dict.keys()))
+
+for line in sys.stdin:
+    words = line.strip().split()
+    uttid = words[0]
+    transcript = ' '.join(words[1:])
+    transcript_fixed = pattern.sub(lambda x: re_dict[x.group()], transcript)
+    sys.stdout.write(uttid + " " + transcript_fixed + "\n")
diff --git a/egs/madcat_zh/v1/path.sh b/egs/madcat_zh/v1/path.sh
new file mode 100755
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/madcat_zh/v1/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/madcat_zh/v1/run.sh b/egs/madcat_zh/v1/run.sh
new file mode 100755
index 00000000000..b3ef370c830
--- /dev/null
+++ b/egs/madcat_zh/v1/run.sh
@@ -0,0 +1,159 @@
+#!/bin/bash
+
+# Copyright  2017  Chun Chieh Chang
+#            2017  Ashish Arora
+#            2017  Hossein Hadian
+
+set -e
+stage=0
+nj=50
+decode_gmm=true
+# madcat_database points to the database path on the JHU grid. If you have not
+# already downloaded the database you can set it to a local directory
+# like "data/download" and follow the instructions
+# in "local/download_data.sh" to download the database.
+# data_split_dir is an unofficial datasplit that is used.
+# The datasplits can be found on http://www.openslr.org/51/
+madcat_database=/export/corpora/LDC/LDC2014T13
+data_split_dir=data/download/datasplits
+overwrite=false
+corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/
+
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. ./path.sh
+. ./utils/parse_options.sh  # e.g. this parses the above options
+                            # if supplied.
+./local/check_tools.sh
+
+# Start from stage=-1 for using extra corpus text
+if [ $stage -le -1 ]; then
+  echo "$(date): getting corpus text for language modelling..."
+  mkdir -p data/local/text/cleaned
+  cat $corpus_dir/* > data/local/text/zh.txt
+  head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt
+  tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt
+fi
+
+mkdir -p data/{train,test,dev}/lines
+if [ $stage -le 0 ]; then
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing, probably the script was run from the wrong stage"
+    echo "Exiting with status 1 to avoid data corruption"
+    exit 1;
+  fi
+
+  echo "$0: Preparing data..."
+  local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir
+
+  for dataset in train test dev; do
+    local/extract_lines.sh --nj $nj --cmd $cmd \
+      --download-dir $madcat_database \
+      --dataset-file $data_split_dir/madcat.${dataset}.raw.lineid \
+      data/${dataset}/lines
+  done
+
+  echo "$0: Processing data..."
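+  # local/process_data.py writes text, utt2spk and images.scp for each split;
+  # image/fix_data_dir.sh then validates and fixes the resulting data directory.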
+ for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done +fi + +mkdir -p data/{train,test,dev}/data +if [ $stage -le 1 ]; then + for dataset in train test dev; do + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 60 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done +fi + +if [ $stage -le 2 ]; then +echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 16 --sil-prob 0.95 \ + --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + data/lang exp/mono +fi + +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ + 50000 20000 data/train data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/tri exp/tri_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" \ + --context-opts "--context-width=2 --central-position=1" 50000 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ + data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd $cmd --context-opts "--context-width=2 --central-position=1" \ + 50000 20000 data/train data/lang \ + exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 11 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + + steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2 +fi diff --git a/egs/madcat_zh/v1/run_end2end.sh b/egs/madcat_zh/v1/run_end2end.sh new file mode 100755 index 00000000000..7e0fc1e25d1 --- /dev/null +++ b/egs/madcat_zh/v1/run_end2end.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=50 +username= +password= +# iam_database points to the database path on the JHU grid. 
If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +madcat_database=/export/corpora/LDC/LDC2014T13 +data_split_dir=data/download/datasplits +overwrite=false +corpus_dir=/export/corpora5/handwriting_ocr/corpus_data/zh/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + + +# Start from stage=-1 for using extra corpus text +if [ $stage -le -1 ]; then + echo "$(date): getting corpus text for language modelling..." + mkdir -p data/local/text/cleaned + cat $corpus_dir/* > data/local/text/zh.txt + head -20000 data/local/text/zh.txt > data/local/text/cleaned/val.txt + tail -n +20000 data/local/text/zh.txt > data/local/text/cleaned/corpus.txt +fi + +if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir1 $madcat_database/data --data-split-dir $data_split_dir + + for dataset in train test dev; do + local/extract_lines.sh --nj $nj --cmd $cmd \ + --download-dir $madcat_database \ + --dataset-file $data_split_dir/madcat.${dataset}.raw.lineid \ + data/${dataset}/lines + done + + echo "$0: Processing data..." + for set in dev train test; do + local/process_data.py $madcat_database $data_split_dir/madcat.$set.raw.lineid data/$set + image/fix_data_dir.sh data/$set + done + +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 1 ]; then + image/get_image2num_frames.py --feat-dim 80 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 80 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 16 --sil-prob 0.95 \ + --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +lang_decode=data/lang_test +decode_e2e=true +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt $lang_decode +fi + +if [ $stage -le 5 ] && $decode_e2e; then + echo "$0: $(date) stage 5: decoding end2end setup..." + utils/mkgraph.sh --self-loop-scale 1.0 $lang_decode \ + exp/chain/e2e_cnn_1a/ exp/chain/e2e_cnn_1a/graph || exit 1; + + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 --nj $nj --cmd "$cmd" \ + exp/chain/e2e_cnn_1a/graph data/test exp/chain/e2e_cnn_1a/decode_test || exit 1; + + echo "$0: Done. 
Date: $(date). Results:"
+  local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
+fi
diff --git a/egs/madcat_zh/v1/steps b/egs/madcat_zh/v1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/madcat_zh/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/madcat_zh/v1/utils b/egs/madcat_zh/v1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/madcat_zh/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/malach/s5/README.txt b/egs/malach/s5/README.txt
new file mode 100644
index 00000000000..9ea62aae53d
--- /dev/null
+++ b/egs/malach/s5/README.txt
@@ -0,0 +1,64 @@
+# Copyright 2019 IBM Corp. (Author: Michael Picheny) Adapted AMI recipe to MALACH corpus
+
+This s5 recipe for MALACH data is a modified version of the s5b
+recipe for AMI.
+
+You need to download the MALACH data to get started. For information about the MALACH database see:
+USC-SFI MALACH Interviews and Transcripts English - Speech Recognition Edition
+https://catalog.ldc.upenn.edu/LDC2019S11
+
+Once the data is downloaded and untarred, you need to run:
+
+run_prepare_shared.sh - prepares most of the data for the system
+run.sh - builds the system
+
+Beforehand, you need to edit BOTH scripts to point to
+where you downloaded and untarred the data. Find the lines in
+run_prepare_shared.sh and run.sh that say:
+
+malach_dir=dummy_directory
+
+Replace "dummy_directory" with the fully-qualified location of the actual
+data. For example, let's say you copied the data distribution tar file to
+/user/jdoe/malach and untarred it there. That would create a high-level directory called
+/user/jdoe/malach/malach_eng_speech_recognition. You would then change the above line to read:
+
+malach_dir=/user/jdoe/malach/malach_eng_speech_recognition/data
+
+Note that the scripts were "tweaked" to always use sclite scoring
+(vs. default Kaldi scoring).
+
+Other issues that we have run up against in setting up this recipe
+that may or may not impact you:
+
+On the system on which these scripts were developed, Python 2.7 and a
+relatively old version of CUDA are the defaults. We had to modify
+path.sh to point to the right libraries for both Python 3 (a number of
+the scripts use Python 3) and the version of CUDA we were using.
+Please modify path.sh accordingly.
+
+You may also have to modify "configure" (around line 405) in
+/speech7/picheny5_nb/forked_kaldi/kaldi/src to point to where your
+version of CUDA lives.
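+
+As an example (the path below is only illustrative, adjust it to your own
+installation), the CUDA toolkit location can usually also be passed to
+configure directly instead of editing the script:
+
+  cd src && ./configure --cudatk-dir=/usr/local/cuda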
+ +Basic pipeline results summary: + +tri2: +%WER 39.1 | 843 12345 | 66.5 25.1 8.3 5.7 39.1 74.0 | -0.230 | exp/tri2/decode_dev_malach.o4g.kn.pr1-9/ascore_13/dev.ctm.filt.sys + +tri3.si: +%WER 42.8 | 843 12345 | 63.4 28.0 8.5 6.3 42.8 76.9 | -1.079 | exp/tri3/decode_dev_malach.o4g.kn.pr1-9.si/ascore_12/dev.ctm.filt.sys + +tri3: +%WER 34.5 | 843 12345 | 70.7 22.1 7.1 5.2 34.5 69.2 | -0.398 | exp/tri3/decode_dev_malach.o4g.kn.pr1-9/ascore_15/dev.ctm.filt.sys + +tri3_cleaned.si: +%WER 43.1 | 843 12345 | 63.6 28.2 8.2 6.7 43.1 79.0 | -1.095 | exp/tri3_cleaned/decode_dev_malach.o4g.kn.pr1-9.si/ascore_12/dev.ctm.filt.sys + +tri3_cleaned: +%WER 35.1 | 843 12345 | 71.0 22.6 6.4 6.1 35.1 72.7 | -0.431 | exp/tri3_cleaned/decode_dev_malach.o4g.kn.pr1-9/ascore_13/dev.ctm.filt.sys + +Results using the chain model, and rescoring the chain model with various LSTMs, can be found in s5/local/chain/run_tdnn.sh + + diff --git a/egs/malach/s5/cmd.sh b/egs/malach/s5/cmd.sh new file mode 100644 index 00000000000..166bfdd450b --- /dev/null +++ b/egs/malach/s5/cmd.sh @@ -0,0 +1,18 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="run.pl --mem 1G" +export decode_cmd="run.pl --mem 2G" +# the use of cuda_cmd is deprecated, used only in 'nnet1', +export cuda_cmd="run.pl --gpu 1 --mem 20G" + + diff --git a/egs/malach/s5/conf/decode.conf b/egs/malach/s5/conf/decode.conf new file mode 100644 index 00000000000..c8a0ece58bf --- /dev/null +++ b/egs/malach/s5/conf/decode.conf @@ -0,0 +1,3 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + diff --git a/egs/malach/s5/conf/mfcc.conf b/egs/malach/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a1aa3d6c158 --- /dev/null +++ b/egs/malach/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false # only non-default option. +--sample-frequency=16000 diff --git a/egs/malach/s5/conf/mfcc_hires.conf b/egs/malach/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/malach/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. 
+--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/malach/s5/conf/online_cmvn.conf b/egs/malach/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/malach/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/malach/s5/local/chain/compare_wer_general.sh b/egs/malach/s5/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..9bd017414ab --- /dev/null +++ b/egs/malach/s5/local/chain/compare_wer_general.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +echo -n "System " +for x in $*; do printf " % 10s" $x; done +echo + +#for d in exp/chain_cleaned/tdnn*/decode_*; do grep Sum $d/*sc*/*ys | utils/best_wer.sh; done|grep eval_hires + + +echo -n "WER on dev " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Rescore with lstm 1a " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev*tdnn_1a/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Rescore with lstm 1b " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev*tdnn_1b/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Rescore with lstm bs_1a " +for x in $*; do + wer=$(grep Sum exp/chain_cleaned/${x}/decode_dev*tdnn_bs_1a/*sc*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "Final train prob " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo + +echo -n "Final valid prob " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_valid.final.log | grep -v xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo + +echo -n "Final train prob (xent) " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_train.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo + +echo -n "Final valid prob (xent) " +for x in $*; do + if [[ "${x}" != *online* ]]; then + prob=$(grep Overall exp/chain_cleaned/${x}/log/compute_prob_valid.final.log | grep -w xent | awk '{print $8}') + printf "% 10s" $prob + fi +done +echo diff --git a/egs/malach/s5/local/chain/run_tdnn.sh b/egs/malach/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..ede0f99dc57 --- /dev/null +++ b/egs/malach/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100644 index 00000000000..007e94ef1a3 --- /dev/null +++ b/egs/malach/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# Copyright 2019 IBM Corp. (Author: Michael Picheny) Adapted AMI recipe to MALACH corpus + +# same as 1h in corresponding AMI s5b recipe but replacing proportional-shrink with l2-regularize. 
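+# (the l2 values used further down in this script are opts="l2-regularize=0.02"
+# for the hidden layers and output_opts="l2-regularize=0.004" for the output layers)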
+ +# local/chain/compare_wer_general.sh tdnn1a_sp_bi + +# System tdnn1a_sp_bi +# WER on dev 23.7 +# Rescore with lstm 1a 21.1 +# Rescore with lstm 1b 20.8 +# Rescore with lstm bs_1a 20.7 +# Final train prob -0.118005 +# Final valid prob -0.167522 +# Final train prob (xent) -2.06349 +# Final valid prob (xent) -2.29166 + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn1a_sp_bi +# exp/chain_cleaned/tdnn1i_sp_bi: num-iters=918 nj=2..4 num-params=7.9M dim=40+100->3696 combine=-0.133->-0.130 (over 19) xent:train/valid[610,917,final]=(-2.37,-2.10,-2.06/-2.60,-2.35,-2.29) logprob:train/valid[610,917,final]=(-0.143,-0.124,-0.118/-0.191,-0.173,-0.168) + +3set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=9 +remove_egs=true + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.02" + output_opts="l2-regularize=0.004" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=450 $opts + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 $opts + relu-batchnorm-layer name=tdnn3 dim=450 $opts + relu-batchnorm-layer name=tdnn4 input=Append(-1,0,1) dim=450 $opts + relu-batchnorm-layer name=tdnn5 dim=450 $opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 $opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 $opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=450 $opts + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=450 $opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain input=tdnn9 dim=450 target-rms=0.5 $opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
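+  # With xent_regularize=0.1 as set above, learning_rate_factor = 0.5 / 0.1 = 5,
+  # i.e. the xent output layer below learns 5 times faster than the chain output.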
+ relu-batchnorm-layer name=prefinal-xent input=tdnn9 dim=450 target-rms=0.5 $opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/malach/s5/local/convert2stm.pl b/egs/malach/s5/local/convert2stm.pl new file mode 100755 index 00000000000..f0b85c65b42 --- /dev/null +++ b/egs/malach/s5/local/convert2stm.pl @@ -0,0 +1,101 @@ +#!/usr/bin/env perl + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# 2013 University of Edinburgh (Author: Pawel Swietojanski) + +# This takes as standard input path to directory containing all the usual +# data files - segments, text, utt2spk and reco2file_and_channel and creates stm + +if (@ARGV < 1 || @ARGV > 2) { + print STDERR "Usage: convert2stm.pl [] > stm-file\n"; + exit(1); +} + +$dir=shift @ARGV; +$utt2spk_file=shift @ARGV || 'utt2spk'; + +$segments = "$dir/segments"; +$reco2file_and_channel = "$dir/reco2file_and_channel"; +$text = "$dir/text"; +$utt2spk_file = "$dir/$utt2spk_file"; + +open(S, "<$segments") || die "opening segments file $segments"; +while() { + @A = split(" ", $_); + @A > 3 || die "convert2stm: Bad line in segments file: $_"; + ($utt, $recording_id, $begin_time, $end_time) = @A[0..3]; + $utt2reco{$utt} = $recording_id; + $begin{$utt} = $begin_time; + $end{$utt} = $end_time; +} +close(S); + +open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel"; +while() { + @A = split(" ", $_); + @A == 3 || die "convert2stm: Bad line in reco2file_and_channel file: $_"; + ($recording_id, $file, $channel) = @A; + $reco2file{$recording_id} = $file; + $reco2channel{$recording_id} = $channel; +} +close(R); + +open(T, "<$text") || die "open text file $text"; +while() { + @A = split(" ", $_); + $utt = shift @A; + $utt2text{$utt} = "@A"; +} +close(T); + +open(U, "<$utt2spk_file") || die "open utt2spk file $utt2spk_file"; +while() { + @A = split(" ", $_); + @A == 2 || die "convert2stm: Bad line in utt2spk file: $_"; + ($utt, $spk) = @A; + $utt2spk{$utt} = $spk; +} +close(U); + +# Now generate the stm file +foreach $utt (sort keys(%utt2reco)) { + + # lines look like: + # [